As you can guess from the title, this is the error I get. I only changed the model passed to AutoModelForCausalLM. The older version was:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True)
However, since my GPU is an NVIDIA GeForce RTX 2080 Ti, it takes about 20 minutes to answer a simple question. So I changed it to:
model = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF",
                                             model_file="llama-2-7b-chat.q4_K_M.gguf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True)
However, this does not work and raises the error. Below is the full code, in case it is needed to diagnose the problem.
Before the full code: I also have the file "llama-2-7b.Q5_K_m.gguf" downloaded from HF in my local environment, but not in the virtual environment. I am not using this local file in the code, but I mention it in case it helps.
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
import os
import sys
import huggingface_hub
from huggingface_hub import notebook_login
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from torch import cuda, bfloat16
import chromadb
from pathlib import Path
from pprint import pprint
import json
from loader import JSONLoader
from langchain.prompts.chat import PromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate, ChatPromptTemplate
import json
from langchain.docstore.document import Document
def parse_json(json_data):
    """Parse JSON data into a Python dictionary."""
    return json.loads(json_data)

def create_doc(json_data):
    """Create a Document object from JSON data."""
    data = parse_json(json_data)
    content_value = ""
    # Collect values of keys that contain "item" in their name
    for key, value in data.items():
        if "item" in key.lower():
            content_value += value + "\n"
    return Document(page_content=content_value, metadata={"company": data["company"]})
##embed_model_id = 'BAAI/bge-base-en' ## CHANGE
embed_model_id = 'sentence-transformers/all-mpnet-base-v2'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu' ## NVIDIA GeForce RTX 2080 TI
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)
docs = []
for file in os.listdir("lessdata"):
    if file.endswith(".json"):
        file_path = "./lessdata/" + file
        with open(file_path) as file:
            json_data = file.read()
        document = create_doc(json_data)
        docs.append(document)
document_splitter = RecursiveCharacterTextSplitter(separators=['\n'], chunk_size = 500, chunk_overlap = 100)
document_chunks = document_splitter.split_documents(docs)
vectordb = Chroma.from_documents(document_chunks,embedding=embed_model, persist_directory='./database')
##vectordb.persist()
'''
vectordb = Chroma.from_documents(document_chunks,embedding=embed_model, persist_directory='./database')
vectordb.persist('./database')
'''
### PLEASE DO NOT TOUCH THE VSCODE
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_auth_token = True,)
model = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF",
                                             model_file="llama-2-7b-chat.q4_K_M.gguf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True)
'''
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True)
'''
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                device_map='auto',
                max_new_tokens=512,
                min_new_tokens=1,
                top_k=5)  ## see it
## In vectorstore, take top 5 closest vectors-inputs-contexts, whatever you wanna call.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs= {'temperature':0.7})
memory = ConversationBufferMemory(memory_key="chat_history", input_key='question', output_key='answer', return_messages=True)
system_template = r"""
Given a context, use your knowledge and answer the question. Be flexible, and try everything to answer in the format asked by query.
----
{context}
----
"""
user_template = "Question:```{question}```"
messages = [
SystemMessagePromptTemplate.from_template(system_template),
HumanMessagePromptTemplate.from_template(user_template)
]
qa_prompt = ChatPromptTemplate.from_messages(messages)
jsonExpert = ConversationalRetrievalChain.from_llm(llm=llm,
                                                   retriever=vectordb.as_retriever(search_kwargs={'k': 1}),  ## whats it
                                                   verbose=True,
                                                   memory=memory,
                                                   combine_docs_chain_kwargs={'prompt': qa_prompt},
                                                   return_source_documents=True)
##retriever returns 1 output object.
chat_history = []
query = "Consider the financials and progress of companies who is in the tech business."
result = jsonExpert({"question": query}, {"chat_history": chat_history})
#result = jsonExpert({"question": query})
sources = result["source_documents"][0]
print(result['answer'])
pprint(sources)
pprint(memory)
Basically, you have to convert your downloaded weights to the Hugging Face Transformers format using this script:
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
Then you would be able to get the config.json.
Source: https://huggingface.co/docs/transformers/main/en/model_doc/llama
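Once the conversion finishes, the output directory contains config.json plus the weights, so the original from_pretrained call can simply point at that local path. A minimal sketch, assuming the converted checkpoint was written to /output/path (the same --output_dir used above) and that accelerate is installed so device_map='auto' works:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hypothetical local path: the --output_dir used in the conversion command
converted_path = "/output/path"

tokenizer = AutoTokenizer.from_pretrained(converted_path)
model = AutoModelForCausalLM.from_pretrained(converted_path,
                                             device_map='auto',
                                             torch_dtype=torch.float16)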