# NOTE(review): the lines below are file-listing metadata ("98 lines /
# 3.2 KiB / Python") accidentally captured from the hosting page during
# extraction; commented out so the module parses.
# 98 lines · 3.2 KiB · Python
import fitz
|
|
import os
|
|
from openai import OpenAI
|
|
from dotenv import load_dotenv
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from langchain_community.vectorstores import FAISS
|
|
|
|
load_dotenv()
|
|
api_key = os.getenv("OPENAI_API_KEY")
|
|
client = OpenAI(api_key=api_key)
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of a PDF.

    Args:
        pdf_path: Path to the PDF file to open.

    Returns:
        The text of all pages concatenated, with a newline appended after
        each page's text (same output as the original per-page "+ \\n").
    """
    # Context manager guarantees the document handle is closed.
    # "".join replaces the original text += ... loop, which is
    # quadratic on large documents.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)
|
|
|
|
def criarChunk(texto):
    """Split *texto* into overlapping chunks ready for embedding.

    Chunks are at most 500 characters with a 50-character overlap so
    that context is preserved across chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_text(texto)
    return chunks
|
|
|
|
def create_faiss_index(chunks, embeddings):
    """Build and return a FAISS vector store over *chunks*.

    FAISS.from_texts embeds every chunk with *embeddings* and indexes
    the resulting vectors.
    """
    return FAISS.from_texts(chunks, embeddings)
|
|
|
|
def search_faiss(vector_store, query, embeddings, top_k=3):
    """Return the *top_k* chunks most similar to *query*.

    Args:
        vector_store: FAISS vector store to search.
        query: Natural-language question.
        embeddings: Unused; kept for interface compatibility with callers.
            similarity_search embeds the query internally.
        top_k: Number of documents to return (default 3).

    Returns:
        List of matching Document objects.
    """
    # BUG FIX: the original called embeddings.embed_query(query) and then
    # discarded the result — a wasted (billed) embedding API round-trip
    # on every search. similarity_search does its own query embedding.
    return vector_store.similarity_search(query, k=top_k)
|
|
|
|
def debug_embeddings(chunks, embeddings):
    """Print diagnostics about the embeddings generated for *chunks*.

    Args:
        chunks: List of text chunks.
        embeddings: Embedding model exposing embed_documents().

    Returns:
        None; output goes to stdout only.
    """
    embeddings_list = embeddings.embed_documents(chunks)

    print("\n DEBUG: Embeddings Gerados")
    print(f"Número total de chunks: {len(chunks)}")
    print(f"Número total de embeddings: {len(embeddings_list)}")

    # Guard the sample section as a whole: the original indexed
    # chunks[0] / embeddings_list[0] without a full guard, which raises
    # IndexError when the PDF produced no chunks.
    if embeddings_list and chunks:
        print(f"Tamanho do primeiro embedding: {len(embeddings_list[0])}")

        print("\n Exemplo de Chunk e seu Embedding:")
        print(f"Chunk: {chunks[0]}")
        print(f"Embedding (primeiros 10 valores): {embeddings_list[0][:10]}")
|
|
|
|
def debug_faiss(vector_store, query, embeddings, top_k=3):
    """Print diagnostics for a FAISS similarity search on *query*.

    Shows the query-embedding dimensionality, the number of retrieved
    chunks, and a 200-character preview of each one.
    """
    vetor = embeddings.embed_query(query)
    print(f"\n DEBUG: Tamanho do vetor da pergunta: {len(vetor)}")

    resultados = vector_store.similarity_search(query, k=top_k)
    print("\n DEBUG: Resultados da busca FAISS")
    print(f"Número de chunks retornados: {len(resultados)}")

    for numero, documento in enumerate(resultados, start=1):
        print(f"\n Chunk {numero}:")
        # Preview only — first 200 characters of the chunk.
        print(documento.page_content[:200])
|
|
|
|
def generate_response(query, vector_store, embeddings):
    """Answer *query* with gpt-3.5-turbo, grounded in retrieved chunks.

    Retrieves the most relevant chunks via search_faiss, joins them into
    one context block, and sends context + question to the chat model.

    Returns:
        The model's reply as a string.
    """
    relevant_docs = search_faiss(vector_store, query, embeddings)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    # Context is injected as a second system message; the user's
    # question goes in last.
    messages = [
        {"role": "system", "content": "Use o contexto abaixo para responder."},
        {"role": "system", "content": context},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return completion.choices[0].message.content
|
|
|
|
# --- Script entry point: build the index once, then run the chat loop. ---

# NOTE(review): the PDF path is hard-coded — consider taking it from
# sys.argv or an environment variable.
pdf_file = "teste.pdf"
texto_extraido = extract_text_from_pdf(pdf_file)
chunks = criarChunk(texto_extraido)

embeddings = OpenAIEmbeddings()

# Diagnostic dump of the generated embeddings (stdout only).
debug_embeddings(chunks, embeddings)

vector_store = create_faiss_index(chunks, embeddings)
debug_faiss(vector_store, "Exemplo de pesquisa", embeddings)

print("Chatbot: Olá! Como te posso ajudar?")
while True:
    user_input = input("Você: ")
    # Any of these (case-insensitive) ends the session.
    if user_input.lower() in ["sair", "exit", "quit"]:
        print("Chatbot: Até logo!")
        break

    # Debug search runs in addition to the one inside generate_response,
    # so each question triggers two similarity searches by design.
    debug_faiss(vector_store, user_input, embeddings)
    resposta = generate_response(user_input, vector_store, embeddings)
    print("Chatbot:", resposta)