Description
Library:
- installed unsloth with
pip install unsloth
- transformers == 4.50.0
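For completeness, this is how I checked the exact versions in the environment (just a quick sanity check, nothing unsloth-specific):
from importlib.metadata import version, PackageNotFoundError

# Print the installed versions of the packages involved
for pkg in ("unsloth", "transformers", "torch"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")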
Code:
I'm running the code from the notebook "Llama3_(8B)-Ollama.ipynb" (https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb#scrollTo=R_Srz9REGK2u).
The model I use below was downloaded from Hugging Face (https://huggingface.co/unsloth/Llama-3.2-1B-unsloth-bnb-4bit), so I don't think there is a mistake on that side.
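If it helps to reproduce, the local cache directory used below can be populated with something like this (a sketch assuming huggingface_hub's snapshot_download; the repo id and cache_dir just mirror the path in my code):
from huggingface_hub import snapshot_download

# Downloads unsloth/Llama-3.2-1B-unsloth-bnb-4bit into the HF cache;
# the resulting folder is the model_file_path used further down.
snapshot_download(
    repo_id = "unsloth/Llama-3.2-1B-unsloth-bnb-4bit",
    cache_dir = "/data2/lss/.cache/huggingface/hub",
)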
from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
"unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
"unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
"unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
"unsloth/llama-3-8b-Instruct-bnb-4bit",
"unsloth/llama-3-70b-bnb-4bit",
"unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
"unsloth/Phi-3-medium-4k-instruct",
"unsloth/mistral-7b-bnb-4bit",
"unsloth/gemma-7b-bnb-4bit", # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth
# model_file_path = '/data2/lss/.cache/huggingface/hub/models--unsloth--llama-3-8b-bnb-4bit'
# model_file_path = '/data2/lss/.cache/huggingface/Llama-3.2-3B-Instruct'
model_file_path = '/data2/lss/.cache/huggingface/hub/models--unsloth--Llama-3.2-1B-unsloth-bnb-4bit'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = model_file_path,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
All I've done is run the code from the notebook (fine-tune Llama 3) in my own Jupyter notebook, inside a new, clean conda environment created just for unsloth. I haven't added anything else, so I have no idea how to handle this. Do I have to switch to another version of transformers?
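If pinning transformers is the way to go, this is what I would try in the same conda environment (the version number is just a guess on my part, not a confirmed fix):
pip install transformers==4.49.0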
Full Traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[2], line 22
18 # model_file_path = '/data2/lss/.cache/huggingface/hub/models--unsloth--llama-3-8b-bnb-4bit'
19 # model_file_path = '/data2/lss/.cache/huggingface/Llama-3.2-3B-Instruct'
20 model_file_path = '/data2/lss/.cache/huggingface/hub/models--unsloth--Llama-3.2-1B-unsloth-bnb-4bit'
---> 22 model, tokenizer = FastLanguageModel.from_pretrained(
23 model_name = model_file_path,
24 max_seq_length = max_seq_length,
25 dtype = dtype,
26 load_in_4bit = load_in_4bit,
27 # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
28 )
File ~/install/anaconda3/envs/unsloth/lib/python3.10/site-packages/unsloth/models/loader.py:308, in FastLanguageModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, use_exact_model_name, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, *args, **kwargs)
300 dispatch_model = FastQwen2Model
301 # Temporary disable optimized Cohere until errors match
302 # elif model_type == "cohere":
303 # dispatch_model = FastCohereModel
(...)
306 # dispatch_model = FastGraniteModel
307 else:
--> 308 return FastModel.from_pretrained(
309 model_name = model_name,
310 max_seq_length = max_seq_length,
311 dtype = dtype,
312 load_in_4bit = load_in_4bit,
313 load_in_8bit = load_in_8bit,
314 full_finetuning = full_finetuning,
315 token = token,
316 device_map = device_map,
317 rope_scaling = rope_scaling, # [TODO] No effect
318 fix_tokenizer = fix_tokenizer, # [TODO] No effect
319 trust_remote_code = trust_remote_code,
320 use_gradient_checkpointing = use_gradient_checkpointing,
321 resize_model_vocab = resize_model_vocab, # [TODO] No effect
322 revision = revision,
323 return_logits = False, # Return logits
324 fullgraph = True, # No graph breaks
325 use_exact_model_name = use_exact_model_name,
326 *args, **kwargs,
327 )
328 pass
330 # Check if this is local model since the tokenizer gets overwritten
File ~/install/anaconda3/envs/unsloth/lib/python3.10/site-packages/unsloth/models/loader.py:666, in FastModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, return_logits, fullgraph, use_exact_model_name, *args, **kwargs)
664 with redirector:
665 patch_loss_functions(torch_compile = False)
--> 666 model_types = unsloth_compile_transformers(
667 dtype = dtype,
668 model_name = model_name,
669 model_types = model_types,
670 token = token,
671 sdpa_dynamic_mask = True,
672 sdpa_bool_masks = True,
673 sdpa_gqa_replace = True,
674 sdpa_dynamic_compile = True,
675 compile_attention = True,
676 disable_causal_masks = True,
677 compile_torch_modules = True,
678 compile_custom_modules = True,
679 compile_function_calls = True,
680 fuse_lm_head = True,
681 gradient_checkpointing = True,
682 manual_replacements = True,
683 fast_lora_forwards = True,
684 fast_residual_stream = False,
685 accurate_accumulation = True,
686 epilogue_fusion = True,
687 max_autotune = False,
688 shape_padding = True,
689 cudagraphs = False,
690 debug = False,
691 fullgraph = fullgraph,
692 import_from_cache = False,
693 disable = False,
694 return_logits = return_logits,
695 trust_remote_code = trust_remote_code,
696 )
697 pass
699 # Check if this is local model since the tokenizer gets overwritten
File ~/install/anaconda3/envs/unsloth/lib/python3.10/site-packages/unsloth/models/_utils.py:1183, in unsloth_compile_transformers(dtype, model_name, model_types, token, revision, trust_remote_code, sdpa_dynamic_mask, sdpa_bool_masks, sdpa_gqa_replace, sdpa_dynamic_compile, compile_attention, disable_causal_masks, compile_torch_modules, compile_custom_modules, compile_function_calls, fuse_lm_head, gradient_checkpointing, manual_replacements, fast_lora_forwards, fast_residual_stream, accurate_accumulation, epilogue_fusion, max_autotune, shape_padding, cudagraphs, debug, fullgraph, import_from_cache, disable, return_logits)
1181 model_types = list(dict().fromkeys(model_types).keys())
1182 for model_type in model_types:
-> 1183 _unsloth_compile_transformers(
1184 model_type,
1185 sdpa_dynamic_mask = sdpa_dynamic_mask,
1186 sdpa_bool_masks = sdpa_bool_masks,
1187 sdpa_gqa_replace = sdpa_gqa_replace,
1188 sdpa_dynamic_compile = sdpa_dynamic_compile,
1189 compile_attention = compile_attention,
1190 disable_causal_masks = disable_causal_masks,
1191 compile_torch_modules = compile_torch_modules,
1192 compile_custom_modules = compile_custom_modules,
1193 compile_function_calls = compile_function_calls,
1194 fuse_lm_head = fuse_lm_head,
1195 gradient_checkpointing = gradient_checkpointing,
1196 manual_replacements = manual_replacements,
1197 fast_lora_forwards = fast_lora_forwards,
1198 fast_residual_stream = fast_residual_stream,
1199 accurate_accumulation = accurate_accumulation,
1200 epilogue_fusion = epilogue_fusion,
1201 max_autotune = max_autotune,
1202 shape_padding = shape_padding,
1203 cudagraphs = cudagraphs,
1204 debug = debug,
1205 fullgraph = fullgraph,
1206 import_from_cache = import_from_cache,
1207 disable = disable,
1208 return_logits = return_logits,
1209 )
1210 pass
1211 # Redo patches which override compiler
File ~/install/anaconda3/envs/unsloth/lib/python3.10/site-packages/unsloth_zoo/compiler.py:1635, in unsloth_compile_transformers(model_type, sdpa_dynamic_mask, sdpa_bool_masks, sdpa_gqa_replace, sdpa_dynamic_compile, compile_attention, disable_causal_masks, compile_torch_modules, compile_custom_modules, compile_function_calls, fuse_lm_head, gradient_checkpointing, manual_replacements, fast_lora_forwards, fast_residual_stream, accurate_accumulation, epilogue_fusion, max_autotune, shape_padding, cudagraphs, debug, fullgraph, import_from_cache, disable, return_logits)
1633 if disable_causal_masks:
1634 for module in other_classes:
-> 1635 source = eval(f"{model_location}.{module}")
1636 if not hasattr(source, "_update_causal_mask"): continue
1638 try: source = inspect.getsource(source.__init__)
File <string>:1
AttributeError: module 'transformers.models.bit.modeling_bit' has no attribute 'Linear'