diff --git a/deployment/gpu-inference/transformers.mdx b/deployment/gpu-inference/transformers.mdx
index 0aeb91d..f5cf185 100644
--- a/deployment/gpu-inference/transformers.mdx
+++ b/deployment/gpu-inference/transformers.mdx
@@ -59,13 +59,13 @@ inputs = tokenizer.apply_chat_template(
     return_tensors="pt",
     tokenize=True,
     return_dict=True,
-)
-input_ids = inputs["input_ids"].to(model.device)
+).to(model.device)
 
-output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
+output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
 
 # Decode only the newly generated tokens (excluding the input prompt)
-response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
+input_length = inputs["input_ids"].shape[1]
+response = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
 print(response)
 # C. elegans, also known as Caenorhabditis elegans, is a small, free-living
 # nematode worm (roundworm) that belongs to the phylum Nematoda.
@@ -137,7 +137,7 @@ generation_config = GenerationConfig(
 )
 
 # Use it in generate()
-output = model.generate(input_ids, generation_config=generation_config)
+output = model.generate(**inputs, generation_config=generation_config)
 ```
 
 For a complete list of parameters, see the [GenerationConfig documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig).
@@ -157,11 +157,10 @@ inputs = tokenizer.apply_chat_template(
     return_tensors="pt",
     tokenize=True,
     return_dict=True,
-)
-input_ids = inputs["input_ids"].to(model.device)
+).to(model.device)
 
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
+output = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
 ```
 
 ## Batch Generation
@@ -191,8 +190,7 @@ inputs = tokenizer.apply_chat_template(
     tokenize=True,
     padding=True,
     return_dict=True,
-)
-inputs = {k: v.to(model.device) for k, v in inputs.items()}
+).to(model.device)
 
 # Generate for all prompts in batch
 outputs = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
@@ -217,6 +215,9 @@ model = AutoModelForImageTextToText.from_pretrained(
     device_map="auto",
     dtype="bfloat16"
 )
+# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)
+model.lm_head.weight = model.get_input_embeddings().weight
+
 processor = AutoProcessor.from_pretrained(model_id)
 
 # Load image and create conversation
diff --git a/lfm/models/lfm2-1.2b-extract.mdx b/lfm/models/lfm2-1.2b-extract.mdx
index eccaa68..fd0fd23 100644
--- a/lfm/models/lfm2-1.2b-extract.mdx
+++ b/lfm/models/lfm2-1.2b-extract.mdx
@@ -93,8 +93,8 @@ If no system prompt is provided, defaults to JSON. Specify format (JSON, XML, or
     {"role": "user", "content": user_input}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
diff --git a/lfm/models/lfm2-1.2b-rag.mdx b/lfm/models/lfm2-1.2b-rag.mdx
index bf219d5..883b832 100644
--- a/lfm/models/lfm2-1.2b-rag.mdx
+++ b/lfm/models/lfm2-1.2b-rag.mdx
@@ -94,8 +94,8 @@ The following documents may provide you additional information to answer questio
     {"role": "user", "content": user_input}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 # Output: The library serves 48 scientists and 85 technicians, along with many visiting staff and students.
diff --git a/lfm/models/lfm2-2.6b-transcript.mdx b/lfm/models/lfm2-2.6b-transcript.mdx
index c1afe91..83d86cc 100644
--- a/lfm/models/lfm2-2.6b-transcript.mdx
+++ b/lfm/models/lfm2-2.6b-transcript.mdx
@@ -121,8 +121,8 @@ Participants: Names (Roles)
     {"role": "user", "content": user_input}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=512, temperature=0.3, do_sample=True)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.3, do_sample=True)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
diff --git a/lfm/models/lfm2-350m-enjp-mt.mdx b/lfm/models/lfm2-350m-enjp-mt.mdx
index 3d5efd6..e8fa2b0 100644
--- a/lfm/models/lfm2-350m-enjp-mt.mdx
+++ b/lfm/models/lfm2-350m-enjp-mt.mdx
@@ -74,8 +74,8 @@ This model requires a specific system prompt to specify translation direction. S
     {"role": "user", "content": "What is C. elegans?"}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 # Output: C. elegansとは何ですか?
@@ -88,8 +88,8 @@ This model requires a specific system prompt to specify translation direction. S
     {"role": "user", "content": "今日は天気がいいですね。"}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 # Output: The weather is nice today.
diff --git a/lfm/models/lfm2-350m-extract.mdx b/lfm/models/lfm2-350m-extract.mdx
index 9162c46..4a537e7 100644
--- a/lfm/models/lfm2-350m-extract.mdx
+++ b/lfm/models/lfm2-350m-extract.mdx
@@ -91,8 +91,8 @@ Schema:
     {"role": "user", "content": user_input}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
diff --git a/lfm/models/lfm2-350m-math.mdx b/lfm/models/lfm2-350m-math.mdx
index 62db0e1..53aa6df 100644
--- a/lfm/models/lfm2-350m-math.mdx
+++ b/lfm/models/lfm2-350m-math.mdx
@@ -60,8 +60,8 @@ LFM2-350M-Math is a tiny reasoning model optimized for mathematical problem solv
     {"role": "user", "content": "If a train travels at 60 mph for 2.5 hours, how far does it travel?"}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
diff --git a/lfm/models/lfm2-350m-pii-extract-jp.mdx b/lfm/models/lfm2-350m-pii-extract-jp.mdx
index e24748c..c684ac4 100644
--- a/lfm/models/lfm2-350m-pii-extract-jp.mdx
+++ b/lfm/models/lfm2-350m-pii-extract-jp.mdx
@@ -83,8 +83,8 @@ Extract specific entities by listing only what you need (e.g., `Extract
     {"role": "user", "content": user_input}
 ]
 
-inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
-outputs = model.generate(inputs, max_new_tokens=256, temperature=0, do_sample=False)
+inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=256, temperature=0, do_sample=False)
 response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
diff --git a/snippets/quickstart/vl-transformers.mdx b/snippets/quickstart/vl-transformers.mdx
index 8dedb41..782c4f0 100644
--- a/snippets/quickstart/vl-transformers.mdx
+++ b/snippets/quickstart/vl-transformers.mdx
@@ -26,6 +26,9 @@ model = AutoModelForImageTextToText.from_pretrained(
     device_map="auto",
     dtype="bfloat16",
 )
+# IMPORTANT: tie lm_head to input embeddings (transformers v5 bug)
+model.lm_head.weight = model.get_input_embeddings().weight
+
 processor = AutoProcessor.from_pretrained(model_id)
 
 url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
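Reviewer note: every text-generation hunk above makes the same two changes. Passing `return_dict=True` to `apply_chat_template` returns a `BatchEncoding`, so a single `.to(model.device)` moves `input_ids` and `attention_mask` together, and calling `model.generate(**inputs)` forwards the attention mask instead of a bare positional tensor (the positional form can trigger the "attention mask is not set" warning when the pad and eos tokens coincide). Below is a minimal, self-contained sketch of the corrected pattern; the `LiquidAI/LFM2-1.2B` checkpoint and the prompt are illustrative stand-ins, not taken from the diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; substitute the model id from the page being edited.
model_id = "LiquidAI/LFM2-1.2B"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype="bfloat16")
tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [{"role": "user", "content": "What is C. elegans?"}]

# return_dict=True yields a BatchEncoding, so .to() moves every tensor
# (input_ids and attention_mask) to the model's device in one call.
inputs = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    tokenize=True,
    return_dict=True,
).to(model.device)

# **inputs forwards the attention mask along with input_ids.
output = model.generate(**inputs, max_new_tokens=256)

# Slice off the prompt so only newly generated tokens are decoded.
input_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(output[0][input_length:], skip_special_tokens=True))
```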
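On the two image-text hunks: the added `model.lm_head.weight = model.get_input_embeddings().weight` line re-ties the output head to the input embeddings, which the in-diff comment attributes to a weight-tying bug in transformers v5 for these checkpoints. A quick sanity check, sketched under the assumption that the tie has just been applied, is that both modules now share one underlying tensor:

```python
# After the tie, lm_head and the input embeddings point at the same storage.
assert model.lm_head.weight.data_ptr() == model.get_input_embeddings().weight.data_ptr()
```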