Merged
21 changes: 11 additions & 10 deletions deployment/gpu-inference/transformers.mdx
@@ -1,6 +1,6 @@
---
title: "Transformers"
description: "Transformers is a library for inference and training of pretrained models."

---

<Tip>
@@ -59,13 +59,13 @@
return_tensors="pt",
tokenize=True,
return_dict=True,
)
input_ids = inputs["input_ids"].to(model.device)
).to(model.device)

output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)

# Decode only the newly generated tokens (excluding the input prompt)
response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
input_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
print(response)
# C. elegans, also known as Caenorhabditis elegans, is a small, free-living
# nematode worm (roundworm) that belongs to the phylum Nematoda.
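The prompt-stripping slice introduced in the hunk above can be sanity-checked with plain lists — the ids below are made-up stand-ins, not real tokenizer output:

```python
# Toy stand-ins for tokenizer output (made-up ids, not a real vocabulary)
prompt_ids = [101, 2054, 2003]        # plays the role of inputs["input_ids"][0]
generated = [2009, 2003, 102]         # tokens produced by generate()
output_row = prompt_ids + generated   # generate() returns prompt + continuation

input_length = len(prompt_ids)        # i.e. inputs["input_ids"].shape[1]
new_tokens = output_row[input_length:]
print(new_tokens)                     # only the continuation: [2009, 2003, 102]
```

Decoding `output[0][input_length:]` therefore yields only the assistant's reply, which is exactly what the revised snippet does.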
Expand All @@ -75,7 +75,7 @@

* **`model_id`**: Can be a Hugging Face model ID (e.g., `"LiquidAI/LFM2.5-1.2B-Instruct"`) or a local path
* **`device_map="auto"`**: Automatically distributes across available GPUs/CPU (requires `accelerate`). Use `device="cuda"` for single GPU or `device="cpu"` for CPU only
* **`dtype="bfloat16"`**: Recommended for modern GPUs. Use `"auto"` for automatic selection, or `"float32"` (slower, more memory)
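In practice the `dtype` choice mostly determines the weight footprint; a rough back-of-envelope helper (the function name and the 2/4 bytes-per-parameter figures are ours, and activations plus KV cache add overhead on top):

```python
def weight_memory_gib(n_params: float, bytes_per_param: int) -> float:
    """Weight-only memory estimate in GiB; activations/KV cache are extra."""
    return n_params * bytes_per_param / 1024**3

# LFM2.5-1.2B-Instruct has roughly 1.2e9 parameters
print(f"bfloat16: {weight_memory_gib(1.2e9, 2):.1f} GiB")  # ~2.2 GiB
print(f"float32:  {weight_memory_gib(1.2e9, 4):.1f} GiB")  # ~4.5 GiB
```

This is why `"float32"` is flagged as "slower, more memory": it doubles the weight bytes relative to `bfloat16` with no quality benefit on modern GPUs.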


<Accordion title="Click to see a pipeline() example">
The [`pipeline()`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) interface provides a simpler API for text generation with automatic chat template handling. It wraps model loading and tokenization, making it ideal for quick prototyping.
@@ -115,7 +115,7 @@

* **`do_sample`** (`bool`): Enable sampling (`True`) or greedy decoding (`False`, default)
* **`temperature`** (`float`, default 1.0): Controls randomness (0.0 = deterministic, higher = more random). Typical range: 0.1-2.0
* **`top_p`** (`float`, default 1.0): Nucleus sampling - limits to tokens with cumulative probability ≤ top\_p. Typical range: 0.1-1.0

* **`top_k`** (`int`, default 50): Limits to top-k most probable tokens. Typical range: 1-100
* **`min_p`** (`float`): Minimum token probability threshold. Typical range: 0.01-0.2
* **`max_new_tokens`** (`int`): Maximum number of tokens to generate (preferred over `max_length`)
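A minimal pure-Python sketch of how these filters interact (`filter_logits` is our illustrative helper, not a transformers API; real implementations operate on tensors and renormalize the surviving probabilities before sampling):

```python
import math

def filter_logits(logits, temperature=1.0, top_k=0, top_p=1.0):
    """Return the token ids still eligible for sampling after filtering."""
    scaled = [l / temperature for l in logits]       # temperature rescales logits
    probs = [math.exp(s) for s in scaled]
    total = sum(probs)
    probs = [p / total for p in probs]               # softmax
    ranked = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)
    if top_k > 0:
        ranked = ranked[:top_k]                      # keep only the k most probable
    kept, cumulative = [], 0.0
    for i in ranked:                                 # nucleus (top_p) cut
        kept.append(i)
        cumulative += probs[i]
        if cumulative >= top_p:
            break
    return kept

logits = [2.0, 1.0, 0.5, -1.0]  # made-up scores for a 4-token vocabulary
print(filter_logits(logits, top_k=2))                     # [0, 1]
print(filter_logits(logits, top_p=0.5))                   # [0]
print(filter_logits(logits, temperature=0.1, top_p=0.9))  # [0] - low T sharpens
print(filter_logits(logits, temperature=2.0, top_p=0.9))  # [0, 1, 2] - high T flattens
```

Note how low temperature concentrates the nucleus on a single token while high temperature lets more candidates through — the same interaction the parameter descriptions above capture.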
Expand All @@ -137,7 +137,7 @@
)

# Use it in generate()
output = model.generate(input_ids, generation_config=generation_config)
output = model.generate(**inputs, generation_config=generation_config)
```

For a complete list of parameters, see the [GenerationConfig documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig).
Expand All @@ -157,11 +157,10 @@
return_tensors="pt",
tokenize=True,
return_dict=True,
)
input_ids = inputs["input_ids"].to(model.device)
).to(model.device)

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
```

## Batch Generation
@@ -191,8 +190,7 @@
tokenize=True,
padding=True,
return_dict=True,
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
).to(model.device)

# Generate for all prompts in batch
outputs = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
Expand All @@ -217,6 +215,9 @@
device_map="auto",
dtype="bfloat16"
)
# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)
model.lm_head.weight = model.get_input_embeddings().weight

processor = AutoProcessor.from_pretrained(model_id)

# Load image and create conversation
4 changes: 2 additions & 2 deletions lfm/models/lfm2-1.2b-extract.mdx
@@ -93,8 +93,8 @@ If no system prompt is provided, defaults to JSON. Specify format (JSON, XML, or
{"role": "user", "content": user_input}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
4 changes: 2 additions & 2 deletions lfm/models/lfm2-1.2b-rag.mdx
@@ -94,8 +94,8 @@ The following documents may provide you additional information to answer questio
{"role": "user", "content": user_input}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
# Output: The library serves 48 scientists and 85 technicians, along with many visiting staff and students.
4 changes: 2 additions & 2 deletions lfm/models/lfm2-2.6b-transcript.mdx
@@ -121,8 +121,8 @@ Participants: Names (Roles)
{"role": "user", "content": user_input}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.3, do_sample=True)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.3, do_sample=True)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
8 changes: 4 additions & 4 deletions lfm/models/lfm2-350m-enjp-mt.mdx
@@ -74,8 +74,8 @@ This model requires a specific system prompt to specify translation direction. S
{"role": "user", "content": "What is C. elegans?"}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
# Output: C. elegansとは何ですか?
Expand All @@ -88,8 +88,8 @@ This model requires a specific system prompt to specify translation direction. S
{"role": "user", "content": "今日は天気がいいですね。"}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
# Output: The weather is nice today.
4 changes: 2 additions & 2 deletions lfm/models/lfm2-350m-extract.mdx
@@ -91,8 +91,8 @@ Schema:
{"role": "user", "content": user_input}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
4 changes: 2 additions & 2 deletions lfm/models/lfm2-350m-math.mdx
@@ -60,8 +60,8 @@ LFM2-350M-Math is a tiny reasoning model optimized for mathematical problem solv
{"role": "user", "content": "If a train travels at 60 mph for 2.5 hours, how far does it travel?"}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```
4 changes: 2 additions & 2 deletions lfm/models/lfm2-350m-pii-extract-jp.mdx
@@ -83,8 +83,8 @@ Extract specific entities by listing only what you need (e.g., `Extract <human_n
{"role": "user", "content": user_input}
]

inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
# Output: {"address": [], "company_name": [], "email_address": ["celegans@liquid.ai"],
73 changes: 3 additions & 70 deletions notebooks/LFM2_Inference_with_Transformers.ipynb
@@ -86,30 +86,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import GenerationConfig\n",
"\n",
"generation_config = GenerationConfig(\n",
" do_sample=True,\n",
" temperature=0.3,\n",
" min_p=0.15,\n",
" repetition_penalty=1.05,\n",
" max_new_tokens=512,\n",
")\n",
"\n",
"prompt = \"Explain quantum computing in simple terms.\"\n",
"inputs = tokenizer.apply_chat_template(\n",
" [{\"role\": \"user\", \"content\": prompt}],\n",
" add_generation_prompt=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True,\n",
").to(model.device)\n",
"\n",
"output = model.generate(**inputs, generation_config=generation_config)\n",
"input_length = inputs[\"input_ids\"].shape[1]\n",
"response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\n",
"print(response)"
]
"source": "from transformers import GenerationConfig\n\ngeneration_config = GenerationConfig(\n do_sample=True,\n temperature=0.1,\n top_k=50,\n repetition_penalty=1.05,\n max_new_tokens=512,\n)\n\nprompt = \"Explain quantum computing in simple terms.\"\ninputs = tokenizer.apply_chat_template(\n [{\"role\": \"user\", \"content\": prompt}],\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n).to(model.device)\n\noutput = model.generate(**inputs, generation_config=generation_config)\ninput_length = inputs[\"input_ids\"].shape[1]\nresponse = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)\nprint(response)"
},
{
"cell_type": "markdown",
@@ -154,51 +131,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoProcessor, AutoModelForImageTextToText\n",
"from transformers.image_utils import load_image\n",
"\n",
"# Load vision model and processor\n",
"model_id = \"LiquidAI/LFM2.5-VL-1.6B\"\n",
"vision_model = AutoModelForImageTextToText.from_pretrained(\n",
" model_id,\n",
" device_map=\"auto\",\n",
" dtype=\"bfloat16\"\n",
")\n",
"\n",
"# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)\n",
"vision_model.lm_head.weight = vision_model.get_input_embeddings().weight\n",
"\n",
"processor = AutoProcessor.from_pretrained(model_id)\n",
"\n",
"# Load image\n",
"url = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\n",
"image = load_image(url)\n",
"\n",
"# Create conversation\n",
"conversation = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\"type\": \"image\", \"image\": image},\n",
" {\"type\": \"text\", \"text\": \"What is in this image?\"},\n",
" ],\n",
" },\n",
"]\n",
"\n",
"# Generate response\n",
"inputs = processor.apply_chat_template(\n",
" conversation,\n",
" add_generation_prompt=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True,\n",
" tokenize=True,\n",
").to(vision_model.device)\n",
"\n",
"outputs = vision_model.generate(**inputs, max_new_tokens=64)\n",
"response = processor.batch_decode(outputs, skip_special_tokens=True)[0]\n",
"print(response)"
]
"source": "from transformers import AutoProcessor, AutoModelForImageTextToText\nfrom transformers.image_utils import load_image\n\n# Load vision model and processor\nmodel_id = \"LiquidAI/LFM2.5-VL-1.6B\"\nvision_model = AutoModelForImageTextToText.from_pretrained(\n model_id,\n device_map=\"auto\",\n dtype=\"bfloat16\"\n)\n\n# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)\nvision_model.lm_head.weight = vision_model.get_input_embeddings().weight\n\nprocessor = AutoProcessor.from_pretrained(model_id)\n\n# Load image\nurl = \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\nimage = load_image(url)\n\n# Create conversation\nconversation = [\n {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image\", \"image\": image},\n {\"type\": \"text\", \"text\": \"What is in this image?\"},\n ],\n },\n]\n\n# Generate response\ninputs = processor.apply_chat_template(\n conversation,\n add_generation_prompt=True,\n return_tensors=\"pt\",\n return_dict=True,\n tokenize=True,\n).to(vision_model.device)\n\noutputs = vision_model.generate(**inputs, do_sample=True, temperature=0.1, min_p=0.15, repetition_penalty=1.05, max_new_tokens=64)\nresponse = processor.batch_decode(outputs, skip_special_tokens=True)[0]\nprint(response)"
},
{
"cell_type": "markdown",
@@ -228,4 +161,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}
22 changes: 2 additions & 20 deletions notebooks/LFM2_Inference_with_vLLM.ipynb
@@ -58,25 +58,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from vllm import LLM, SamplingParams\n",
"\n",
"# Initialize the model\n",
"llm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n",
"\n",
"# Define sampling parameters\n",
"sampling_params = SamplingParams(\n",
" temperature=0.3,\n",
" min_p=0.15,\n",
" repetition_penalty=1.05,\n",
" max_tokens=512\n",
")\n",
"\n",
"# Generate answer\n",
"messages = [{\"role\": \"user\", \"content\": \"What is C. elegans?\"}]\n",
"output = llm.chat(messages, sampling_params)\n",
"print(output[0].outputs[0].text)"
]
"source": "from vllm import LLM, SamplingParams\n\n# Initialize the model\nllm = LLM(model=\"LiquidAI/LFM2.5-1.2B-Instruct\")\n\n# Define sampling parameters\nsampling_params = SamplingParams(\n temperature=0.1,\n top_k=50,\n repetition_penalty=1.05,\n max_tokens=512\n)\n\n# Generate answer\nmessages = [{\"role\": \"user\", \"content\": \"What is C. elegans?\"}]\noutput = llm.chat(messages, sampling_params)\nprint(output[0].outputs[0].text)"
},
{
"cell_type": "markdown",
@@ -149,7 +131,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "from vllm import LLM, SamplingParams\nfrom typing import List, Dict, Any\n\ndef build_messages(parts):\n content = []\n for item in parts:\n if item[\"type\"] == \"text\":\n content.append({\"type\": \"text\", \"text\": item[\"value\"]})\n elif item[\"type\"] == \"image\":\n content.append({\"type\": \"image_url\", \"image_url\": {\"url\": item[\"value\"]}})\n return [{\"role\": \"user\", \"content\": content}]\n\nIMAGE_URL = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n\nllm = LLM(\n model=\"LiquidAI/LFM2.5-VL-1.6B\",\n max_model_len=1024,\n)\n\nsampling_params = SamplingParams(\n temperature=0.0,\n max_tokens=1024,\n)\n\n# Batch multiple prompts - text-only and multimodal\nprompts: List[List[Dict[str, Any]]] = [ # type: ignore[no-redef]\n [{\"type\": \"text\", \"value\": \"What is C. elegans?\"}],\n [{\"type\": \"text\", \"value\": \"Say hi in JSON format\"}],\n [\n {\"type\": \"image\", \"value\": IMAGE_URL},\n {\"type\": \"text\", \"value\": \"Describe what you see in this image.\"},\n ],\n]\n\nconversations = [build_messages(p) for p in prompts]\noutputs = llm.chat(conversations, sampling_params)\n\nfor output in outputs:\n print(output.outputs[0].text)\n print(\"---\")"
"source": "from vllm import LLM, SamplingParams\nfrom typing import List, Dict, Any\n\ndef build_messages(parts):\n content = []\n for item in parts:\n if item[\"type\"] == \"text\":\n content.append({\"type\": \"text\", \"text\": item[\"value\"]})\n elif item[\"type\"] == \"image\":\n content.append({\"type\": \"image_url\", \"image_url\": {\"url\": item[\"value\"]}})\n return [{\"role\": \"user\", \"content\": content}]\n\nIMAGE_URL = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n\nllm = LLM(\n model=\"LiquidAI/LFM2.5-VL-1.6B\",\n max_model_len=1024,\n)\n\nsampling_params = SamplingParams(\n temperature=0.1,\n min_p=0.15,\n repetition_penalty=1.05,\n max_tokens=1024,\n)\n\n# Batch multiple prompts - text-only and multimodal\nprompts: List[List[Dict[str, Any]]] = [ # type: ignore[no-redef]\n [{\"type\": \"text\", \"value\": \"What is C. elegans?\"}],\n [{\"type\": \"text\", \"value\": \"Say hi in JSON format\"}],\n [\n {\"type\": \"image\", \"value\": IMAGE_URL},\n {\"type\": \"text\", \"value\": \"Describe what you see in this image.\"},\n ],\n]\n\nconversations = [build_messages(p) for p in prompts]\noutputs = llm.chat(conversations, sampling_params)\n\nfor output in outputs:\n print(output.outputs[0].text)\n print(\"---\")"
},
{
"cell_type": "markdown",