import os

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)


def extract_text_from_pdf(pdf_path):
    """Extract plain text from every page of the PDF."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text


def criarChunk(texto):
    """Split the extracted text into overlapping chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return text_splitter.split_text(texto)


def create_faiss_index(chunks, embeddings):
    """Build the FAISS index from the text chunks."""
    return FAISS.from_texts(chunks, embeddings)


def search_faiss(vector_store, query, top_k=3):
    """Return the top_k chunks most similar to the query.

    similarity_search embeds the query internally, so no separate
    embed_query call is needed here.
    """
    return vector_store.similarity_search(query, k=top_k)


def debug_embeddings(chunks, embeddings):
    """Print basic statistics about the generated embeddings."""
    embeddings_list = embeddings.embed_documents(chunks)
    print("\nDEBUG: generated embeddings")
    print(f"Total number of chunks: {len(chunks)}")
    print(f"Total number of embeddings: {len(embeddings_list)}")
    if embeddings_list:
        print(f"Dimension of the first embedding: {len(embeddings_list[0])}")
        print("\nSample chunk and its embedding:")
        print(f"Chunk: {chunks[0]}")
        print(f"Embedding (first 10 values): {embeddings_list[0][:10]}")


def debug_faiss(vector_store, query, embeddings, top_k=3):
    """Print the query-vector size and the chunks FAISS returns."""
    query_embedding = embeddings.embed_query(query)
    print(f"\nDEBUG: query vector dimension: {len(query_embedding)}")
    docs = vector_store.similarity_search(query, k=top_k)
    print("\nDEBUG: FAISS search results")
    print(f"Number of chunks returned: {len(docs)}")
    for i, doc in enumerate(docs):
        print(f"\nChunk {i + 1}:")
        print(doc.page_content[:200])  # Show the first 200 characters of the chunk


def generate_response(query, vector_store):
    """Answer the query with gpt-3.5-turbo, using the retrieved chunks as context."""
    docs = search_faiss(vector_store, query)
    context = "\n".join(doc.page_content for doc in docs)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Use the context below to answer."},
            {"role": "system", "content": context},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content


pdf_file = "teste.pdf"
texto_extraido = extract_text_from_pdf(pdf_file)
chunks = criarChunk(texto_extraido)

embeddings = OpenAIEmbeddings()
debug_embeddings(chunks, embeddings)

vector_store = create_faiss_index(chunks, embeddings)
debug_faiss(vector_store, "Example query", embeddings)

print("Chatbot: Hello! How can I help you?")
while True:
    user_input = input("You: ")
    if user_input.lower() in ["sair", "exit", "quit"]:
        print("Chatbot: Goodbye!")
        break
    debug_faiss(vector_store, user_input, embeddings)
    resposta = generate_response(user_input, vector_store)
    print("Chatbot:", resposta)
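
# Optional debugging helper, a minimal sketch rather than part of the original
# script: LangChain's FAISS wrapper also exposes similarity_search_with_score,
# which returns each hit together with its L2 distance, making retrieval
# quality easier to judge than the raw chunks printed by debug_faiss above.
# The function name debug_faiss_scores is an arbitrary choice.
def debug_faiss_scores(vector_store, query, top_k=3):
    """Print each retrieved chunk together with its distance score."""
    for doc, score in vector_store.similarity_search_with_score(query, k=top_k):
        print(f"score={score:.4f}  {doc.page_content[:100]}")

# Example usage (lower scores mean closer matches):
# debug_faiss_scores(vector_store, "Example query")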