import fitz  # PyMuPDF, used for PDF text extraction
import os
from openai import OpenAI
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Load the OpenAI API key from a local .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in the PDF as a single string."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text
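
# Hedged variant (an assumption, not used by the rest of this script): extract
# the text page by page so each chunk could later carry its page number as
# metadata, e.g. for citing sources in answers.
def extract_pages_from_pdf(pdf_path):
    pages = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            pages.append({"page": i + 1, "text": page.get_text("text")})
    return pages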

def criarChunk(texto):
    """Split the extracted text into overlapping chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,    # maximum characters per chunk
        chunk_overlap=50,  # overlap preserves context across chunk boundaries
        length_function=len,
    )
    return text_splitter.split_text(texto)

def create_faiss_index(chunks, embeddings):
    """Embed the chunks and build an in-memory FAISS index over them."""
    vector_store = FAISS.from_texts(chunks, embeddings)  # build the FAISS index
    return vector_store
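
# Optional persistence sketch (an assumption: you may want to avoid re-embedding
# the PDF on every run). save_local/load_local are standard LangChain FAISS
# methods; the "faiss_index" folder name is arbitrary.
def save_faiss_index(vector_store, path="faiss_index"):
    vector_store.save_local(path)

def load_faiss_index(path, embeddings):
    # Recent LangChain versions require this flag because the stored index
    # metadata is pickled; only load indexes you created yourself.
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)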

def search_faiss(vector_store, query, embeddings, top_k=3):
    """Return the top_k chunks most similar to the query."""
    # similarity_search embeds the query internally using the embeddings
    # already bound to the index, so no explicit embed_query call is needed.
    docs = vector_store.similarity_search(query, k=top_k)
    return docs
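
# Hedged alternative: similarity_search_with_score also returns the FAISS L2
# distance for each hit (lower is closer), which is handy when tuning
# chunk_size or top_k. Purely illustrative; nothing else here depends on it.
def search_faiss_with_scores(vector_store, query, top_k=3):
    return vector_store.similarity_search_with_score(query, k=top_k)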

def debug_embeddings(chunks, embeddings):
    """Print basic statistics about the generated embeddings."""
    embeddings_list = embeddings.embed_documents(chunks)
    print("\nDEBUG: generated embeddings")
    print(f"Total number of chunks: {len(chunks)}")
    print(f"Total number of embeddings: {len(embeddings_list)}")
    if embeddings_list:
        print(f"Size of the first embedding: {len(embeddings_list[0])}")
        print("\nSample chunk and its embedding:")
        print(f"Chunk: {chunks[0]}")
        print(f"Embedding (first 10 values): {embeddings_list[0][:10]}")

def debug_faiss(vector_store, query, embeddings, top_k=3):
    """Print the query embedding size and the chunks FAISS returns for it."""
    query_embedding = embeddings.embed_query(query)
    print(f"\nDEBUG: query vector size: {len(query_embedding)}")
    docs = vector_store.similarity_search(query, k=top_k)
    print("\nDEBUG: FAISS search results")
    print(f"Number of chunks returned: {len(docs)}")
    for i, doc in enumerate(docs):
        print(f"\nChunk {i + 1}:")
        print(doc.page_content[:200])  # show the first 200 characters of the chunk

def generate_response(query, vector_store, embeddings):
    """Retrieve the most relevant chunks and ask the model to answer from them."""
    docs = search_faiss(vector_store, query, embeddings)
    context = "\n".join(doc.page_content for doc in docs)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Use the context below to answer."},
            {"role": "system", "content": context},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content
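
# Streaming sketch (an assumption: you want tokens printed as they arrive
# instead of waiting for the full answer). stream=True is standard in the
# openai v1 client; delta.content can be None, hence the guard.
def generate_response_stream(query, vector_store, embeddings):
    docs = search_faiss(vector_store, query, embeddings)
    context = "\n".join(doc.page_content for doc in docs)
    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Use the context below to answer."},
            {"role": "system", "content": context},
            {"role": "user", "content": query},
        ],
        stream=True,
    )
    parts = []
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
            parts.append(delta)
    print()
    return "".join(parts)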

# Build the index once at startup, then answer questions in a loop.
pdf_file = "teste.pdf"
texto_extraido = extract_text_from_pdf(pdf_file)
chunks = criarChunk(texto_extraido)
embeddings = OpenAIEmbeddings()
debug_embeddings(chunks, embeddings)
vector_store = create_faiss_index(chunks, embeddings)
debug_faiss(vector_store, "Sample query", embeddings)

print("Chatbot: Hello! How can I help you?")
while True:
    user_input = input("You: ")
    if user_input.lower() in ["sair", "exit", "quit"]:
        print("Chatbot: Goodbye!")
        break
    debug_faiss(vector_store, user_input, embeddings)
    resposta = generate_response(user_input, vector_store, embeddings)
    print("Chatbot:", resposta)