From dca7d71022fd67455fbe6c503c12b2779e7ddab5 Mon Sep 17 00:00:00 2001 From: alay2shah Date: Tue, 24 Feb 2026 18:02:16 -0500 Subject: [PATCH 1/2] Fix code examples: update API patterns and sampling parameters - Update transformers examples to use return_dict=True and **inputs pattern (from notebook) - Fix sampling parameters in notebooks to match model card recommendations (LFM2.5-1.2B-Instruct: temp=0.1, top_k=50) - Add lm_head workaround for transformers v5 vision model bug - Regenerate quickstart snippets from updated sources to keep all examples in sync --- deployment/gpu-inference/transformers.mdx | 24 +++--- .../LFM2_Inference_with_Transformers.ipynb | 73 +------------------ notebooks/LFM2_Inference_with_vLLM.ipynb | 22 +----- notebooks/quickstart_snippets.ipynb | 6 +- scripts/generate_snippets.py | 9 ++- snippets/quickstart/text-transformers.mdx | 9 ++- snippets/quickstart/vl-transformers.mdx | 3 + 7 files changed, 35 insertions(+), 111 deletions(-) diff --git a/deployment/gpu-inference/transformers.mdx b/deployment/gpu-inference/transformers.mdx index 2dd9870..18e50ab 100644 --- a/deployment/gpu-inference/transformers.mdx +++ b/deployment/gpu-inference/transformers.mdx @@ -53,17 +53,18 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # Generate answer prompt = "What is C. elegans?" -input_ids = tokenizer.apply_chat_template( +inputs = tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, return_tensors="pt", - tokenize=True, + return_dict=True, ).to(model.device) -output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) +output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) # Decode only the newly generated tokens (excluding the input prompt) -response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True) +input_length = inputs["input_ids"].shape[1] +response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True) print(response) # C. elegans, also known as Caenorhabditis elegans, is a small, free-living # nematode worm (roundworm) that belongs to the phylum Nematoda. @@ -135,7 +136,7 @@ generation_config = GenerationConfig( ) # Use it in generate() -output = model.generate(input_ids, generation_config=generation_config) +output = model.generate(**inputs, generation_config=generation_config) ``` For a complete list of parameters, see the [GenerationConfig documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig). @@ -149,15 +150,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer # Use the model and tokenizer setup from Basic Usage above prompt = "Tell me a story about space exploration." 
-input_ids = tokenizer.apply_chat_template( +inputs = tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, return_tensors="pt", - tokenize=True, + return_dict=True, ).to(model.device) streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) -output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512) +output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512) ``` ## Batch Generation @@ -184,12 +185,12 @@ batch = tokenizer.apply_chat_template( prompts, add_generation_prompt=True, return_tensors="pt", - tokenize=True, + return_dict=True, padding=True, ).to(model.device) # Generate for all prompts in batch -outputs = model.generate(batch, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) +outputs = model.generate(**batch, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) # Decode outputs for output in outputs: @@ -211,6 +212,9 @@ model = AutoModelForImageTextToText.from_pretrained( device_map="auto", dtype="bfloat16" ) +# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug) +model.lm_head.weight = model.get_input_embeddings().weight + processor = AutoProcessor.from_pretrained(model_id) # Load image and create conversation diff --git a/notebooks/LFM2_Inference_with_Transformers.ipynb b/notebooks/LFM2_Inference_with_Transformers.ipynb index eb6b96c..636ba48 100644 --- a/notebooks/LFM2_Inference_with_Transformers.ipynb +++ b/notebooks/LFM2_Inference_with_Transformers.ipynb @@ -86,30 +86,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from transformers import GenerationConfig\n", - "\n", - "generation_config = GenerationConfig(\n", - " do_sample=True,\n", - " temperature=0.3,\n", - " min_p=0.15,\n", - " repetition_penalty=1.05,\n", - " max_new_tokens=512,\n", - ")\n", - "\n", - "prompt = \"Explain quantum computing in simple terms.\"\n", - "inputs = tokenizer.apply_chat_template(\n", - " [{\"role\": \"user\", \"content\": prompt}],\n", - " add_generation_prompt=True,\n", - " return_tensors=\"pt\",\n", - " return_dict=True,\n", - ").to(model.device)\n", - "\n", - "output = model.generate(**inputs, generation_config=generation_config)\n", - "input_length = inputs[\"input_ids\"].shape[1]\n", - "response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\n", - "print(response)" - ] + "source": "from transformers import GenerationConfig\n\ngeneration_config = GenerationConfig(\n do_sample=True,\n temperature=0.1,\n top_k=50,\n repetition_penalty=1.05,\n max_new_tokens=512,\n)\n\nprompt = \"Explain quantum computing in simple terms.\"\ninputs = tokenizer.apply_chat_template(\n [{\"role\": \"user\", \"content\": prompt}],\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n).to(model.device)\n\noutput = model.generate(**inputs, generation_config=generation_config)\ninput_length = inputs[\"input_ids\"].shape[1]\nresponse = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\nprint(response)" }, { "cell_type": "markdown", @@ -154,51 +131,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from transformers import AutoProcessor, AutoModelForImageTextToText\n", - "from transformers.image_utils import load_image\n", - "\n", - "# Load vision model and processor\n", - "model_id = 
\"LiquidAI/LFM2.5-VL-1.6B\"\n", - "vision_model = AutoModelForImageTextToText.from_pretrained(\n", - " model_id,\n", - " device_map=\"auto\",\n", - " dtype=\"bfloat16\"\n", - ")\n", - "\n", - "# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)\n", - "vision_model.lm_head.weight = vision_model.get_input_embeddings().weight\n", - "\n", - "processor = AutoProcessor.from_pretrained(model_id)\n", - "\n", - "# Load image\n", - "url = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\n", - "image = load_image(url)\n", - "\n", - "# Create conversation\n", - "conversation = [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\"type\": \"image\", \"image\": image},\n", - " {\"type\": \"text\", \"text\": \"What is in this image?\"},\n", - " ],\n", - " },\n", - "]\n", - "\n", - "# Generate response\n", - "inputs = processor.apply_chat_template(\n", - " conversation,\n", - " add_generation_prompt=True,\n", - " return_tensors=\"pt\",\n", - " return_dict=True,\n", - " tokenize=True,\n", - ").to(vision_model.device)\n", - "\n", - "outputs = vision_model.generate(**inputs, max_new_tokens=64)\n", - "response = processor.batch_decode(outputs, skip_special_tokens=True)[0]\n", - "print(response)" - ] + "source": "from transformers import AutoProcessor, AutoModelForImageTextToText\nfrom transformers.image_utils import load_image\n\n# Load vision model and processor\nmodel_id = \"LiquidAI/LFM2.5-VL-1.6B\"\nvision_model = AutoModelForImageTextToText.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\"\n)\n\n# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)\nvision_model.lm_head.weight = vision_model.get_input_embeddings().weight\n\nprocessor = AutoProcessor.from_pretrained(model_id)\n\n# Load image\nurl = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimage = load_image(url)\n\n# Create conversation\nconversation = [\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image\", \"image\": image},\n {\"type\": \"text\", \"text\": \"What is in this image?\"},\n ],\n },\n]\n\n# Generate response\ninputs = processor.apply_chat_template(\n conversation,\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n tokenize=True,\n).to(vision_model.device)\n\noutputs = vision_model.generate(**inputs, do_sample=True, temperature=0.1, min_p=0.15, repetition_penalty=1.05, max_new_tokens=64)\nresponse = processor.batch_decode(outputs, skip_special_tokens=True)[0]\nprint(response)" }, { "cell_type": "markdown", @@ -228,4 +161,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/notebooks/LFM2_Inference_with_vLLM.ipynb b/notebooks/LFM2_Inference_with_vLLM.ipynb index b38106f..ac72203 100644 --- a/notebooks/LFM2_Inference_with_vLLM.ipynb +++ b/notebooks/LFM2_Inference_with_vLLM.ipynb @@ -58,25 +58,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from vllm import LLM, SamplingParams\n", - "\n", - "# Initialize the model\n", - "llm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n", - "\n", - "# Define sampling parameters\n", - "sampling_params = SamplingParams(\n", - " temperature=0.3,\n", - " min_p=0.15,\n", - " repetition_penalty=1.05,\n", - " max_tokens=512\n", - ")\n", - "\n", - "# Generate answer\n", - "messages = [{\"role\": \"user\", \"content\": \"What is C. 
elegans?\"}]\n", - "output = llm.chat(messages, sampling_params)\n", - "print(output[0].outputs[0].text)" - ] + "source": "from vllm import LLM, SamplingParams\n\n# Initialize the model\nllm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n\n# Define sampling parameters\nsampling_params = SamplingParams(\n temperature=0.1,\n top_k=50,\n repetition_penalty=1.05,\n max_tokens=512\n)\n\n# Generate answer\nmessages = [{\"role\": \"user\", \"content\": \"What is C. elegans?\"}]\noutput = llm.chat(messages, sampling_params)\nprint(output[0].outputs[0].text)" }, { "cell_type": "markdown", @@ -149,7 +131,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "from vllm import LLM, SamplingParams\nfrom typing import List, Dict, Any\n\ndef build_messages(parts):\n content = []\n for item in parts:\n if item[\"type\"] == \"text\":\n content.append({\"type\": \"text\", \"text\": item[\"value\"]})\n elif item[\"type\"] == \"image\":\n content.append({\"type\": \"image_url\", \"image_url\": {\"url\": item[\"value\"]}})\n return [{\"role\": \"user\", \"content\": content}]\n\nIMAGE_URL = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n\nllm = LLM(\n model=\"LiquidAI/LFM2.5-VL-1.6B\",\n max_model_len=1024,\n)\n\nsampling_params = SamplingParams(\n temperature=0.0,\n max_tokens=1024,\n)\n\n# Batch multiple prompts - text-only and multimodal\nprompts: List[List[Dict[str, Any]]] = [ # type: ignore[no-redef]\n [{\"type\": \"text\", \"value\": \"What is C. elegans?\"}],\n [{\"type\": \"text\", \"value\": \"Say hi in JSON format\"}],\n [\n {\"type\": \"image\", \"value\": IMAGE_URL},\n {\"type\": \"text\", \"value\": \"Describe what you see in this image.\"},\n ],\n]\n\nconversations = [build_messages(p) for p in prompts]\noutputs = llm.chat(conversations, sampling_params)\n\nfor output in outputs:\n print(output.outputs[0].text)\n print(\"---\")" + "source": "from vllm import LLM, SamplingParams\nfrom typing import List, Dict, Any\n\ndef build_messages(parts):\n content = []\n for item in parts:\n if item[\"type\"] == \"text\":\n content.append({\"type\": \"text\", \"text\": item[\"value\"]})\n elif item[\"type\"] == \"image\":\n content.append({\"type\": \"image_url\", \"image_url\": {\"url\": item[\"value\"]}})\n return [{\"role\": \"user\", \"content\": content}]\n\nIMAGE_URL = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n\nllm = LLM(\n model=\"LiquidAI/LFM2.5-VL-1.6B\",\n max_model_len=1024,\n)\n\nsampling_params = SamplingParams(\n temperature=0.1,\n min_p=0.15,\n repetition_penalty=1.05,\n max_tokens=1024,\n)\n\n# Batch multiple prompts - text-only and multimodal\nprompts: List[List[Dict[str, Any]]] = [ # type: ignore[no-redef]\n [{\"type\": \"text\", \"value\": \"What is C. 
elegans?\"}],\n [{\"type\": \"text\", \"value\": \"Say hi in JSON format\"}],\n [\n {\"type\": \"image\", \"value\": IMAGE_URL},\n {\"type\": \"text\", \"value\": \"Describe what you see in this image.\"},\n ],\n]\n\nconversations = [build_messages(p) for p in prompts]\noutputs = llm.chat(conversations, sampling_params)\n\nfor output in outputs:\n print(output.outputs[0].text)\n print(\"---\")" }, { "cell_type": "markdown", diff --git a/notebooks/quickstart_snippets.ipynb b/notebooks/quickstart_snippets.ipynb index 85ce945..85e4255 100644 --- a/notebooks/quickstart_snippets.ipynb +++ b/notebooks/quickstart_snippets.ipynb @@ -32,7 +32,7 @@ "snippet": "text-transformers" }, "outputs": [], - "source": "from transformers import AutoModelForCausalLM, AutoTokenizer\n\nmodel_id = \"LiquidAI/LFM2.5-1.2B-Instruct\"\nmodel = AutoModelForCausalLM.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\",\n)\ntokenizer = AutoTokenizer.from_pretrained(model_id)\n\ninput_ids = tokenizer.apply_chat_template(\n [{\"role\": \"user\", \"content\": \"What is machine learning?\"}],\n add_generation_prompt=True,\n return_tensors=\"pt\",\n tokenize=True,\n).to(model.device)\n\noutput = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, top_p=0.1, repetition_penalty=1.05, max_new_tokens=512)\nresponse = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)\nprint(response)" + "source": "from transformers import AutoModelForCausalLM, AutoTokenizer\n\nmodel_id = \"LiquidAI/LFM2.5-1.2B-Instruct\"\nmodel = AutoModelForCausalLM.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\",\n)\ntokenizer = AutoTokenizer.from_pretrained(model_id)\n\ninputs = tokenizer.apply_chat_template(\n [{\"role\": \"user\", \"content\": \"What is machine learning?\"}],\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n).to(model.device)\n\noutput = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)\ninput_length = inputs[\"input_ids\"].shape[1]\nresponse = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\nprint(response)" }, { "cell_type": "code", @@ -41,7 +41,7 @@ "snippet": "text-vllm" }, "outputs": [], - "source": "from vllm import LLM, SamplingParams\n\nllm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n\nsampling_params = SamplingParams(\n temperature=0.1,\n top_k=50,\n top_p=0.1,\n repetition_penalty=1.05,\n max_tokens=512,\n)\n\noutput = llm.chat(\"What is machine learning?\", sampling_params)\nprint(output[0].outputs[0].text)" + "source": "from vllm import LLM, SamplingParams\n\nllm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n\nsampling_params = SamplingParams(\n temperature=0.1,\n top_k=50,\n repetition_penalty=1.05,\n max_tokens=512,\n)\n\noutput = llm.chat(\"What is machine learning?\", sampling_params)\nprint(output[0].outputs[0].text)" }, { "cell_type": "markdown", @@ -57,7 +57,7 @@ "snippet": "vl-transformers" }, "outputs": [], - "source": "from transformers import AutoProcessor, AutoModelForImageTextToText\nfrom transformers.image_utils import load_image\n\nmodel_id = \"LiquidAI/LFM2.5-VL-1.6B\"\nmodel = AutoModelForImageTextToText.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\",\n)\nprocessor = AutoProcessor.from_pretrained(model_id)\n\nurl = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimage = load_image(url)\n\nconversation = [\n {\n \"role\": \"user\",\n 
\"content\": [\n {\"type\": \"image\", \"image\": image},\n {\"type\": \"text\", \"text\": \"What is in this image?\"},\n ],\n },\n]\n\ninputs = processor.apply_chat_template(\n conversation,\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n tokenize=True,\n).to(model.device)\n\noutputs = model.generate(**inputs, do_sample=True, temperature=0.1, min_p=0.15, repetition_penalty=1.05, max_new_tokens=256)\nresponse = processor.batch_decode(outputs, skip_special_tokens=True)[0]\nprint(response)" + "source": "from transformers import AutoProcessor, AutoModelForImageTextToText\nfrom transformers.image_utils import load_image\n\nmodel_id = \"LiquidAI/LFM2.5-VL-1.6B\"\nmodel = AutoModelForImageTextToText.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\",\n)\n# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)\nmodel.lm_head.weight = model.get_input_embeddings().weight\n\nprocessor = AutoProcessor.from_pretrained(model_id)\n\nurl = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimage = load_image(url)\n\nconversation = [\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image\", \"image\": image},\n {\"type\": \"text\", \"text\": \"What is in this image?\"},\n ],\n },\n]\n\ninputs = processor.apply_chat_template(\n conversation,\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n tokenize=True,\n).to(model.device)\n\noutputs = model.generate(**inputs, do_sample=True, temperature=0.1, min_p=0.15, repetition_penalty=1.05, max_new_tokens=256)\nresponse = processor.batch_decode(outputs, skip_special_tokens=True)[0]\nprint(response)" }, { "cell_type": "code", diff --git a/scripts/generate_snippets.py b/scripts/generate_snippets.py index 5729110..fdd78b6 100644 --- a/scripts/generate_snippets.py +++ b/scripts/generate_snippets.py @@ -59,15 +59,16 @@ ")\n" "tokenizer = AutoTokenizer.from_pretrained(model_id)\n" "\n" - "input_ids = tokenizer.apply_chat_template(\n" + "inputs = tokenizer.apply_chat_template(\n" ' [{"role": "user", "content": "What is machine learning?"}],\n' " add_generation_prompt=True,\n" ' return_tensors="pt",\n' - " tokenize=True,\n" + " return_dict=True,\n" ").to(model.device)\n" "\n" - "output = model.generate(input_ids, ${samplingParams}max_new_tokens=512)\n" - "response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)\n" + "output = model.generate(**inputs, ${samplingParams}max_new_tokens=512)\n" + 'input_length = inputs["input_ids"].shape[1]\n' + "response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\n" "print(response)" )}, ], diff --git a/snippets/quickstart/text-transformers.mdx b/snippets/quickstart/text-transformers.mdx index 8affb7b..d05b35d 100644 --- a/snippets/quickstart/text-transformers.mdx +++ b/snippets/quickstart/text-transformers.mdx @@ -19,15 +19,16 @@ model = AutoModelForCausalLM.from_pretrained( ) tokenizer = AutoTokenizer.from_pretrained(model_id) -input_ids = tokenizer.apply_chat_template( +inputs = tokenizer.apply_chat_template( [{"role": "user", "content": "What is machine learning?"}], add_generation_prompt=True, return_tensors="pt", - tokenize=True, + return_dict=True, ).to(model.device) -output = model.generate(input_ids, ${samplingParams}max_new_tokens=512) -response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True) +output = model.generate(**inputs, ${samplingParams}max_new_tokens=512) +input_length = inputs["input_ids"].shape[1] 
+response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True) print(response)`.split('\n').map((line, i) => {line}{'\n'})} diff --git a/snippets/quickstart/vl-transformers.mdx b/snippets/quickstart/vl-transformers.mdx index 8dedb41..782c4f0 100644 --- a/snippets/quickstart/vl-transformers.mdx +++ b/snippets/quickstart/vl-transformers.mdx @@ -26,6 +26,9 @@ model = AutoModelForImageTextToText.from_pretrained( device_map="auto", dtype="bfloat16", ) +# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug) +model.lm_head.weight = model.get_input_embeddings().weight + processor = AutoProcessor.from_pretrained(model_id) url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" From e5db8194b0f090d00c65af4d49b8697f166563dd Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Tue, 24 Feb 2026 19:30:48 -0500 Subject: [PATCH 2/2] coherence --- deployment/gpu-inference/transformers.mdx | 11 ++++------- lfm/models/lfm2-1.2b-extract.mdx | 4 ++-- lfm/models/lfm2-1.2b-rag.mdx | 4 ++-- lfm/models/lfm2-2.6b-transcript.mdx | 4 ++-- lfm/models/lfm2-350m-enjp-mt.mdx | 8 ++++---- lfm/models/lfm2-350m-extract.mdx | 4 ++-- lfm/models/lfm2-350m-math.mdx | 4 ++-- lfm/models/lfm2-350m-pii-extract-jp.mdx | 4 ++-- scripts/generate_snippets.py | 3 +-- snippets/quickstart/text-transformers.mdx | 3 +-- 10 files changed, 22 insertions(+), 27 deletions(-) diff --git a/deployment/gpu-inference/transformers.mdx b/deployment/gpu-inference/transformers.mdx index 8765f6d..f5cf185 100644 --- a/deployment/gpu-inference/transformers.mdx +++ b/deployment/gpu-inference/transformers.mdx @@ -59,8 +59,7 @@ inputs = tokenizer.apply_chat_template( return_tensors="pt", tokenize=True, return_dict=True, -) -input_ids = inputs["input_ids"].to(model.device) +).to(model.device) output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) @@ -158,8 +157,7 @@ inputs = tokenizer.apply_chat_template( return_tensors="pt", tokenize=True, return_dict=True, -) -input_ids = inputs["input_ids"].to(model.device) +).to(model.device) streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512) @@ -189,11 +187,10 @@ inputs = tokenizer.apply_chat_template( prompts, add_generation_prompt=True, return_tensors="pt", - return_dict=True, + tokenize=True, padding=True, return_dict=True, -) -inputs = {k: v.to(model.device) for k, v in inputs.items()} +).to(model.device) # Generate for all prompts in batch outputs = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512) diff --git a/lfm/models/lfm2-1.2b-extract.mdx b/lfm/models/lfm2-1.2b-extract.mdx index eccaa68..fd0fd23 100644 --- a/lfm/models/lfm2-1.2b-extract.mdx +++ b/lfm/models/lfm2-1.2b-extract.mdx @@ -93,8 +93,8 @@ If no system prompt is provided, defaults to JSON. 
Specify format (JSON, XML, or {"role": "user", "content": user_input} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) ``` diff --git a/lfm/models/lfm2-1.2b-rag.mdx b/lfm/models/lfm2-1.2b-rag.mdx index bf219d5..883b832 100644 --- a/lfm/models/lfm2-1.2b-rag.mdx +++ b/lfm/models/lfm2-1.2b-rag.mdx @@ -94,8 +94,8 @@ The following documents may provide you additional information to answer questio {"role": "user", "content": user_input} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) # Output: The library serves 48 scientists and 85 technicians, along with many visiting staff and students. diff --git a/lfm/models/lfm2-2.6b-transcript.mdx b/lfm/models/lfm2-2.6b-transcript.mdx index c1afe91..83d86cc 100644 --- a/lfm/models/lfm2-2.6b-transcript.mdx +++ b/lfm/models/lfm2-2.6b-transcript.mdx @@ -121,8 +121,8 @@ Participants: Names (Roles) {"role": "user", "content": user_input} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=512, temperature=0.3, do_sample=True) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.3, do_sample=True) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) ``` diff --git a/lfm/models/lfm2-350m-enjp-mt.mdx b/lfm/models/lfm2-350m-enjp-mt.mdx index 3d5efd6..e8fa2b0 100644 --- a/lfm/models/lfm2-350m-enjp-mt.mdx +++ b/lfm/models/lfm2-350m-enjp-mt.mdx @@ -74,8 +74,8 @@ This model requires a specific system prompt to specify translation direction. S {"role": "user", "content": "What is C. elegans?"} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) # Output: C. elegansとは何ですか? @@ -88,8 +88,8 @@ This model requires a specific system prompt to specify translation direction. S {"role": "user", "content": "今日は天気がいいですね。"} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) # Output: The weather is nice today. 
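

For reference, the hunks in this series all converge on one call shape: `apply_chat_template(..., return_dict=True)` feeding `generate(**inputs)`. Below is a minimal end-to-end sketch of that pattern; the model id, prompt, and sampling settings are taken from the examples in this patch, and no call outside those diffs is introduced:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "LiquidAI/LFM2.5-1.2B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype="bfloat16")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# return_dict=True yields a BatchEncoding with input_ids AND attention_mask,
# so generate(**inputs) receives an explicit attention mask rather than
# inferring one (the reason for the input_ids -> **inputs migration above).
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is C. elegans?"}],
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)

# Decode only the newly generated tokens, excluding the input prompt.
input_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(output[0][input_length:], skip_special_tokens=True))
```

Passing the full `BatchEncoding` also keeps the text and vision examples symmetric: processor outputs that carry extra tensors (such as pixel values in the VL snippets) flow through the same `generate(**inputs, ...)` call unchanged.
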
diff --git a/lfm/models/lfm2-350m-extract.mdx b/lfm/models/lfm2-350m-extract.mdx index 9162c46..4a537e7 100644 --- a/lfm/models/lfm2-350m-extract.mdx +++ b/lfm/models/lfm2-350m-extract.mdx @@ -91,8 +91,8 @@ Schema: {"role": "user", "content": user_input} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) ``` diff --git a/lfm/models/lfm2-350m-math.mdx b/lfm/models/lfm2-350m-math.mdx index 62db0e1..53aa6df 100644 --- a/lfm/models/lfm2-350m-math.mdx +++ b/lfm/models/lfm2-350m-math.mdx @@ -60,8 +60,8 @@ LFM2-350M-Math is a tiny reasoning model optimized for mathematical problem solv {"role": "user", "content": "If a train travels at 60 mph for 2.5 hours, how far does it travel?"} ] - inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device) - outputs = model.generate(inputs, max_new_tokens=256) + inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device) + outputs = model.generate(**inputs, max_new_tokens=256) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(response) ``` diff --git a/lfm/models/lfm2-350m-pii-extract-jp.mdx b/lfm/models/lfm2-350m-pii-extract-jp.mdx index e24748c..c684ac4 100644 --- a/lfm/models/lfm2-350m-pii-extract-jp.mdx +++ b/lfm/models/lfm2-350m-pii-extract-jp.mdx @@ -83,8 +83,8 @@ Extract specific entities by listing only what you need (e.g., `Extract