RAG From Scratch: Query Transformations
Query transformations are a family of approaches focused on re-writing and/or modifying the question before retrieval.
Environment
(1) Packages
```python
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain
```
(2) LangSmith
https://docs.smith.langchain.com/
```python
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = <your-api-key>
```
(3) API Key
```python
os.environ['OPENAI_API_KEY'] = <your-api-key>
```
Part 5: Multi Query
Flow: generate several rephrasings of the user question, retrieve documents for each in parallel, then take the unique union of the results as context.

Docs:
Indexing
```python
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Load the blog post, keeping only the main content elements
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

# Split into ~300-token chunks with 50-token overlap
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(blog_docs)

# Embed and index the chunks
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()
```
Prompt
```python
from langchain.prompts import ChatPromptTemplate

# Multi Query: generate several alternative phrasings of the user question
template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))  # one query per line
)
```
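Before wiring the generator into retrieval, it can be useful to sanity-check it on its own. A minimal sketch (`alt_queries` is a hypothetical name, and the exact rephrasings will vary by model):

```python
# Hypothetical sanity check: inspect the generated alternative questions
alt_queries = generate_queries.invoke({"question": "What is task decomposition for LLM agents?"})
for q in alt_queries:
    print(q)
```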
```python
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten the list of lists, serializing each Document to a string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # De-duplicate
    unique_docs = list(set(flattened_docs))
    # Deserialize back to Documents
    return [loads(doc) for doc in unique_docs]

# Retrieve: run the retriever over each generated query, then take the unique union
question = "What is task decomposition for LLM agents?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question": question})
len(docs)
```
```python
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG over the unique union of retrieved documents
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})
```
Part 6: RAG-Fusion
Flow: like multi query, generate multiple search queries and retrieve for each, but rerank the combined results with reciprocal rank fusion (RRF) instead of taking a plain union.

Docs:

Blog / repo:
Prompt
```python
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: generate multiple search queries from a single input query
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)
```
```python
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)
```
```python
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal rank fusion over multiple lists of ranked documents,
        with an optional parameter k used in the RRF formula """

    # Fused score for each unique document
    fused_scores = {}

    for docs in results:
        # rank is the document's position within this ranked list
        for rank, doc in enumerate(docs):
            # Serialize the document so it can be used as a dict key
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # RRF formula: each appearance contributes 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort documents by fused score, descending, returning (doc, score) pairs
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)
```
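To build intuition for the formula, here is a self-contained toy run of RRF on plain strings, independent of the chain above (`rrf_demo` is illustrative, not part of the original): a document ranked near the top of several lists accumulates the largest fused score.

```python
# Toy RRF on plain strings: "b" ranks high in both lists, so it wins
def rrf_demo(rankings, k=60):
    scores = {}
    for ranked in rankings:
        for rank, item in enumerate(ranked):
            scores[item] = scores.get(item, 0) + 1 / (rank + k)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

print(rrf_demo([["a", "b", "c"], ["b", "c", "d"]]))
# [('b', 0.0330...), ('c', 0.0325...), ('a', 0.0166...), ('d', 0.0161...)]
```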
```python
from langchain_core.runnables import RunnablePassthrough

# Final RAG chain over the fused, reranked documents
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})
```
Trace:
https://smith.langchain.com/public/071202c9-9f4d-41b1-bf9d-86b7c5a7525b/r
Part 7: Decomposition

Decomposition breaks the input question into a set of sub-questions, which are then answered either recursively (each answer feeding into the next) or individually (with the answers synthesized at the end).
```python
from langchain.prompts import ChatPromptTemplate

# Decomposition: break the input question into sub-questions
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)
```
```python
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

llm = ChatOpenAI(temperature=0)

generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

question = "What are the main components of an LLM-powered autonomous agent system?"
questions = generate_queries_decomposition.invoke({"question": question})
```
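It can help to look at what the decomposition step actually produced before answering; the exact sub-questions are model-dependent:

```python
# Inspect the generated sub-questions
for q in questions:
    print(q)
```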
Answer recursively
Paper:
```python
# Prompt: answer each sub-question using prior Q+A pairs plus retrieved context
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question:

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""
decomposition_prompt = ChatPromptTemplate.from_template(template)
```
```python
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format a question and its answer as a Q+A pair"""
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Answer each sub-question in turn, feeding earlier Q+A pairs into later ones
q_a_pairs = ""
for q in questions:
    rag_chain = (
        {"context": itemgetter("question") | retriever,
         "question": itemgetter("question"),
         "q_a_pairs": itemgetter("q_a_pairs")}
        | decomposition_prompt
        | llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
    q_a_pair = format_qa_pair(q, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair
```
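After the loop, `answer` holds the response to the last sub-question, which was produced with all earlier Q+A pairs in context, so it can serve as the final answer:

```python
# The last answer was built on top of all prior sub-answers
print(answer)
```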
Answer individually
```python
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# RAG prompt from the LangChain hub
prompt_rag = hub.pull("rlm/rag-prompt")

def retrieve_and_rag(question, prompt_rag, sub_question_generator_chain):
    """RAG on each sub-question"""

    # Generate the sub-questions
    sub_questions = sub_question_generator_chain.invoke({"question": question})

    # Answer each sub-question independently
    rag_results = []
    for sub_question in sub_questions:
        # Retrieve documents relevant to this sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)

        # Run the RAG chain on the retrieved documents and the sub-question
        answer = (prompt_rag | llm | StrOutputParser()).invoke(
            {"context": retrieved_docs, "question": sub_question}
        )
        rag_results.append(answer)

    return rag_results, sub_questions

answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)
```
```python
def format_qa_pairs(questions, answers):
    """Format numbered Q+A pairs into a single string"""
    formatted_string = ""
    for i, (question, answer) in enumerate(zip(questions, answers), start=1):
        formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
    return formatted_string.strip()

context = format_qa_pairs(questions, answers)

# Synthesize a final answer from the individual Q+A pairs
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context": context, "question": question})
```
Part 8: Step Back

Step-back prompting rewrites the question into a more generic one, then retrieves context for both the original and the step-back question.
Paper:
```python
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

# Few-shot examples of step-back question rewriting
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
        ),
        few_shot_prompt,
        ("user", "{question}"),
    ]
)
```
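To see exactly what the model receives (the system message, the two few-shot turns, then the user question), you can render the assembled prompt; this inspection step is an addition, not part of the original flow:

```python
# Render the assembled few-shot prompt for a sample question
print(prompt.format(question="What is task decomposition for LLM agents?"))
```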
```python
generate_queries_step_back = prompt | ChatOpenAI(temperature=0) | StrOutputParser()
question = "What is task decomposition for LLM agents?"
generate_queries_step_back.invoke({"question": question})
```
```python
# Response prompt
response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        # Retrieve context using the original question
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        # Retrieve context using the step-back question
        "step_back_context": generate_queries_step_back | retriever,
        # Pass the question through
        "question": lambda x: x["question"],
    }
    | response_prompt
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

chain.invoke({"question": question})
```
Part 9: HyDE

HyDE generates a hypothetical document that answers the question and uses that document, rather than the raw question, for retrieval.
Docs:

Paper:
```python
from langchain.prompts import ChatPromptTemplate

# HyDE: generate a hypothetical document for the question
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_docs_for_retrieval = (
    prompt_hyde | ChatOpenAI(temperature=0) | StrOutputParser()
)

# Run
question = "What is task decomposition for LLM agents?"
generate_docs_for_retrieval.invoke({"question": question})
```
```python
# Retrieve using the hypothetical document instead of the raw question
retrieval_chain = generate_docs_for_retrieval | retriever
retrieved_docs = retrieval_chain.invoke({"question": question})
retrieved_docs
```
```python
# Final RAG chain over the documents retrieved via HyDE
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context": retrieved_docs, "question": question})
```
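The two-step HyDE flow above can also be collapsed into a single chain. A minimal sketch reusing `prompt` and `llm` from the block above (`final_hyde_chain` is a hypothetical name, not from the original):

```python
from operator import itemgetter

# Hypothetical end-to-end variant: question -> hypothetical doc -> retrieve -> answer
final_hyde_chain = (
    {"context": generate_docs_for_retrieval | retriever,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_hyde_chain.invoke({"question": question})
```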