Commit 7dbbaf3

Make the GGUF conversion and Hub upload more robust.

1 parent 51a3938 commit 7dbbaf3

File tree

3 files changed: +46 -194 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ __pycache__/
 *.dylib
 
 upcoming.md
+GEMINI.md
 examples/
 *.ipynb
 

quantllm/hub/hub_manager.py

Lines changed: 13 additions & 17 deletions
@@ -34,34 +34,30 @@ def push_model(
     ):
         """Push model and tokenizer to HuggingFace Hub."""
         try:
+            # Ensure the repository exists
             if not self.api.repo_exists(self.model_id):
                 self.api.create_repo(
                     repo_id=self.model_id,
                     token=self.token,
                     organization=self.organization
                 )
                 logger.log_success(f"Created new repository: {self.model_id}")
+
+            # Save model and tokenizer to a temporary directory and upload
+            with tempfile.TemporaryDirectory() as temp_dir:
+                model.save_pretrained(temp_dir)
+                tokenizer.save_pretrained(temp_dir)
 
-            # Push model
-            model.push_to_hub(
-                self.model_id,
-                token=self.token,
-                commit_message=commit_message,
-                **kwargs
-            )
-            logger.log_success(f"Successfully pushed model to {self.model_id}")
+                self.push_folder(
+                    folder_path=temp_dir,
+                    commit_message=commit_message,
+                    **kwargs
+                )
 
-            # Push tokenizer
-            tokenizer.push_to_hub(
-                self.model_id,
-                token=self.token,
-                commit_message=commit_message,
-                **kwargs
-            )
-            logger.log_success(f"Successfully pushed tokenizer to {self.model_id}")
+            logger.log_success(f"Successfully pushed model and tokenizer to {self.model_id}")
 
         except Exception as e:
-            logger.log_error(f"Error pushing to hub: {str(e)}")
+            logger.log_error(f"Error pushing model to hub: {str(e)}")
             raise
 
     def push_folder(
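
For context, a minimal usage sketch of the new push path. HubManager and its constructor arguments are assumptions (the diff shows only the push_model body); the save-to-temp-dir-then-push_folder behavior comes from the change above.

    # Hypothetical usage sketch -- HubManager and its constructor arguments
    # are assumptions; only push_model's new behavior (save both artifacts
    # to a temp dir, then one push_folder upload) comes from the diff.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from quantllm.hub.hub_manager import HubManager  # class name assumed

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    manager = HubManager(model_id="your-user/opt-125m-demo", token="hf_...")

    # Serializes model and tokenizer into a temporary directory and uploads
    # them via a single push_folder call, i.e. one Hub commit instead of two.
    manager.push_model(model, tokenizer, commit_message="Initial upload")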

quantllm/quant/gguf.py

Lines changed: 32 additions & 177 deletions
@@ -298,193 +298,48 @@ def _log_model_stats(self, model: PreTrainedModel, stage: str = ""):
             logger.log_info(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024 * 1024):.2f} MB")
 
     def convert_to_gguf(self, output_path: str):
-        """Convert model to GGUF format with separate quantization step."""
+        """
+        Convert model to GGUF format using the LlamaCppConverter for robustness.
+        """
+        from .llama_cpp_utils import LlamaCppConverter
+
         if not CT_AVAILABLE:
-            raise ImportError("CTransformers is required for GGUF conversion")
-
-        temp_dir = None
-        temp_gguf = None
+            raise ImportError("CTransformers is required for GGUF conversion. Install with: pip install ctransformers")
+
         try:
             logger.log_info("\n" + "="*80)
-            logger.log_info("🚀 Starting GGUF Conversion Process".center(80))
+            logger.log_info("🚀 Starting GGUF Conversion Process via LlamaCppConverter".center(80))
             logger.log_info("="*80 + "\n")
+
+            output_dir = os.path.dirname(output_path)
+            custom_name = os.path.basename(output_path)
+
+            converter = LlamaCppConverter(verbose=True)
 
-            # Model Information
-            logger.log_info("📊 Model Information:")
-            logger.log_info("-"*40)
-            model_type = self.model.config.model_type if hasattr(self.model, 'config') else None
-            supported_types = ["llama", "mistral", "falcon", "mpt", "gpt_neox", "pythia", "stablelm"]
-
-            if model_type in supported_types:
-                logger.log_info(f"• Architecture: {model_type.upper()}")
-            else:
-                logger.log_info(f"• Architecture: Unknown (using default LLAMA)")
-                model_type = "llama"
-
-            total_params = sum(p.numel() for p in self.model.parameters())
-            logger.log_info(f"• Total Parameters: {total_params:,}")
-            model_size = sum(p.numel() * p.element_size() for p in self.model.parameters()) / (1024**3)
-            logger.log_info(f"• Model Size: {model_size:.2f} GB")
-            logger.log_info("")
-
-            # Conversion Settings
-            logger.log_info("⚙️ Conversion Settings:")
-            logger.log_info("-"*40)
-            logger.log_info(f"• Output Path: {output_path}")
-            logger.log_info(f"• Quantization Type: {self.quant_type}")
-            logger.log_info(f"• Target Bits: {self.bits}")
-            logger.log_info(f"• Group Size: {self.group_size}")
-            logger.log_info("")
-
-            # Save temporary checkpoint
-            temp_dir = f"{output_path}_temp_hf"
-            logger.log_info("💾 Saving Temporary Checkpoint:")
-            logger.log_info("-"*40)
-            logger.log_info(f"• Checkpoint Path: {temp_dir}")
-            self.model.save_pretrained(temp_dir, safe_serialization=True)
-            logger.log_info("• Checkpoint saved successfully")
-            logger.log_info("")
-
-            # Find llama.cpp tools
-            logger.log_info("🔍 Locating GGUF Conversion Tools:")
-            logger.log_info("-"*40)
-
-            try:
-                import llama_cpp
-                llama_cpp_path = os.path.dirname(llama_cpp.__file__)
-                convert_script = os.path.join(llama_cpp_path, "convert.py")
-                quantize_bin = os.path.join(llama_cpp_path, "quantize")
-                if not os.path.exists(convert_script):
-                    raise FileNotFoundError("convert.py not found")
-                if not os.path.exists(quantize_bin):
-                    raise FileNotFoundError("quantize binary not found")
-                logger.log_info(f"• Found convert.py: {convert_script}")
-                logger.log_info(f"• Found quantize: {quantize_bin}")
-            except (ImportError, FileNotFoundError) as e:
-                logger.log_error(f"• Failed to locate llama.cpp tools: {e}")
-                try:
-                    logger.log_info("• Attempting to install llama-cpp-python...")
-                    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "llama-cpp-python"])
-                    import llama_cpp
-                    llama_cpp_path = os.path.dirname(llama_cpp.__file__)
-                    convert_script = os.path.join(llama_cpp_path, "convert.py")
-                    quantize_bin = os.path.join(llama_cpp_path, "quantize")
-                    logger.log_info("• Successfully installed and located tools")
-                except Exception as inst_err:
-                    raise RuntimeError(
-                        f"Could not find or install llama-cpp-python: {inst_err}\n"
-                        "Install manually: pip install llama-cpp-python --upgrade"
-                    ) from e
-
-            # Convert to FP16 GGUF
-            logger.log_info("🛠️ Converting to FP16 GGUF:")
-            logger.log_info("-"*40)
-            temp_gguf = f"{output_path}_temp_f16.gguf"
-            cmd_convert = [
-                sys.executable,
-                convert_script,
-                temp_dir,
-                "--outfile", temp_gguf,
-                "--outtype", "f16",
-                "--model-type", model_type
-            ]
-
-            logger.log_info(f"• Command: {' '.join(cmd_convert)}")
-            with tqdm(total=100, desc="Converting to FP16", unit="%") as pbar:
-                process = subprocess.Popen(
-                    cmd_convert,
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    universal_newlines=True
-                )
-
-                while True:
-                    output = process.stdout.readline()
-                    if output == '' and process.poll() is not None:
-                        break
-                    if output and "Converting" in output:
-                        try:
-                            progress = int(output.split("%")[0].split()[-1])
-                            pbar.n = progress
-                            pbar.refresh()
-                        except:
-                            pass
-                    logger.log_info(f"• {output.strip()}")
-
-                return_code = process.wait()
-                if return_code != 0:
-                    error_output = process.stderr.read()
-                    raise RuntimeError(f"FP16 GGUF conversion failed:\n{error_output}")
-
-            # Quantize to target type
-            logger.log_info("\n🔄 Quantizing GGUF:")
-            logger.log_info("-"*40)
-            cmd_quantize = [
-                quantize_bin,
-                temp_gguf,
-                output_path,
-                self.quant_type.lower()  # llama.cpp expects lowercase
-            ]
-
-            logger.log_info(f"• Command: {' '.join(cmd_quantize)}")
-            with tqdm(total=100, desc="Quantizing GGUF", unit="%") as pbar:
-                process = subprocess.Popen(
-                    cmd_quantize,
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    universal_newlines=True
-                )
-
-                while True:
-                    output = process.stdout.readline()
-                    if output == '' and process.poll() is not None:
-                        break
-                    if output and "%" in output:
-                        try:
-                            progress = int(output.split("%")[0].split()[-1])
-                            pbar.n = progress
-                            pbar.refresh()
-                        except:
-                            pass
-                    logger.log_info(f"• {output.strip()}")
-
-                return_code = process.wait()
-                if return_code != 0:
-                    error_output = process.stderr.read()
-                    raise RuntimeError(f"GGUF quantization failed:\n{error_output}")
-
-            # Verify results
-            if os.path.exists(output_path):
-                logger.log_info("\n✅ Conversion Results:")
-                logger.log_info("-"*40)
-
-                file_size = os.path.getsize(output_path) / (1024**3)
-                logger.log_info(f"• GGUF File Size: {file_size:.2f} GB")
-
-                compression_ratio = model_size / file_size
-                logger.log_info(f"• Compression Ratio: {compression_ratio:.2f}x")
-                logger.log_info(f"• Output Path: {output_path}")
-
-                logger.log_info("\n" + "="*80)
-                logger.log_info("✨ GGUF Conversion Completed Successfully! ✨".center(80))
-                logger.log_info("="*80 + "\n")
-            else:
-                raise RuntimeError(f"GGUF file was not created at {output_path}")
-
+            gguf_path = converter.convert_to_gguf(
+                model=self.model,
+                output_dir=output_dir,
+                bits=self.bits,
+                group_size=self.group_size,
+                save_tokenizer=True,  # It's good practice to save the tokenizer
+                custom_name=custom_name,
+                quant_type=self.quant_type
+            )
+
+            if not os.path.exists(gguf_path):
+                raise RuntimeError(f"GGUF file was not created at {gguf_path}")
+
+            logger.log_info("\n" + "="*80)
+            logger.log_info("✨ GGUF Conversion Completed Successfully! ✨".center(80))
+            logger.log_info(f"📄 GGUF file saved to: {gguf_path}".center(80))
+            logger.log_info("="*80 + "\n")
+
         except Exception as e:
-            logger.log_error("\n❌ Conversion Failed:")
+            logger.log_error("\nGGUF Conversion Failed:")
             logger.log_error("-"*40)
             logger.log_error(f"• Error: {str(e)}")
             raise RuntimeError(f"Failed to convert model to GGUF: {str(e)}") from e
-
         finally:
-            if temp_dir and os.path.exists(temp_dir):
-                logger.log_info("\n🧹 Cleaning Up:")
-                logger.log_info("-"*40)
-                logger.log_info("• Removing temporary files...")
-                shutil.rmtree(temp_dir, ignore_errors=True)
-            if temp_gguf and os.path.exists(temp_gguf):
-                os.remove(temp_gguf)
             self._clear_memory()
 
     def _clear_memory(self):
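
For context, a minimal usage sketch of the new conversion path. The quantizer class name and constructor arguments are assumptions; the convert_to_gguf(output_path) signature, the bits/group_size/quant_type attributes, and the delegation to LlamaCppConverter all come from the diff above.

    # Hypothetical usage sketch -- GGUFQuantizer and its constructor
    # arguments are assumptions; convert_to_gguf(output_path) and its
    # delegation to LlamaCppConverter come from the diff.
    from quantllm.quant.gguf import GGUFQuantizer  # class name assumed

    quantizer = GGUFQuantizer(
        model_name="facebook/opt-125m",  # assumed argument
        bits=4,
        group_size=128,
        quant_type="q4_k_m",
    )

    # output_path is split into output_dir + custom_name and handed to
    # LlamaCppConverter.convert_to_gguf, which returns the final file path.
    quantizer.convert_to_gguf("gguf_out/opt-125m.q4_k_m.gguf")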
