 import re
 import shutil
 import time
+import csv
+from dataclasses import dataclass
 from multiprocessing import Pool
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 import random
 import pytoml
 from loguru import logger
 from .helper import histogram
 from .retriever import CacheRetriever, Retriever
 
+
+@dataclass
+class InitializeConfig:
+    """Configuration for initializing the feature store."""
+    files: List[FileName]
+    work_dir: str
+    ner_file: Optional[str] = None
+    qa_pair_file: Optional[str] = None
+
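+# A minimal usage sketch (illustrative only; `fs_init` mirrors the CLI entry
+# point at the bottom of this diff):
+#   config = InitializeConfig(files=files, work_dir='workdir',
+#                             qa_pair_file='qa_pairs.csv')
+#   fs_init.initialize(config=config)
+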
 def empty_cache():
     try:
         from torch.cuda import empty_cache as cuda_empty_cache
@@ -170,7 +181,68 @@ def build_sparse(self, files: List[FileName], work_dir: str):
         bm25 = BM25Okapi()
         bm25.save(chunks, sparse_dir)
 
-    def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
+    def process_qa_pairs(self, qa_pair_file: str) -> List[Chunk]:
+        """Process QA pairs from a CSV or JSON file.
+
+        Args:
+            qa_pair_file: Path to the CSV or JSON file containing QA pairs.
+
+        Returns:
+            List of Chunk objects, where each key becomes the chunk content
+            and the value is kept in the chunk metadata.
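+
+        Example inputs (illustrative):
+            CSV:  "What is HuixiangDou?","A domain knowledge assistant."
+            JSON: {"question1": "answer1"} or
+                  [{"key": "question1", "value": "answer1"}]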
192+ """
193+ chunks = []
194+ file_ext = os .path .splitext (qa_pair_file )[1 ].lower ()
195+
+        try:
+            if file_ext == '.csv':
+                # CSV file: first column is the key, second column the value
+                with open(qa_pair_file, 'r', encoding='utf-8') as f:
+                    csv_reader = csv.reader(f)
+                    for row in csv_reader:
+                        if len(row) >= 2:
+                            key, value = row[0], row[1]
+                            # Create a chunk with the key as content and the
+                            # value in metadata
+                            chunk = Chunk(
+                                modal='qa',
+                                content_or_path=key,
+                                metadata={'read': qa_pair_file,
+                                          'source': qa_pair_file,
+                                          'qa': f'{key}: {value}'}
+                            )
+                            chunks.append(chunk)
+
+            elif file_ext == '.json':
+                # Process JSON file
+                with open(qa_pair_file, 'r', encoding='utf-8') as f:
+                    qa_data = json.load(f)
+
+                # Handle different JSON formats
+                if isinstance(qa_data, dict):
+                    # Format: {"key1": "value1", "key2": "value2", ...}
+                    for key, value in qa_data.items():
+                        chunk = Chunk(
+                            modal='qa',
+                            content_or_path=key,
+                            metadata={'read': qa_pair_file,
+                                      'source': qa_pair_file,
+                                      'qa': f'{key}: {value}'}
+                        )
+                        chunks.append(chunk)
+                elif isinstance(qa_data, list):
+                    # Format: [{"key": "key1", "value": "value1"}, ...]
+                    for item in qa_data:
+                        if isinstance(item, dict) and 'key' in item and 'value' in item:
+                            # take the pair from the item itself rather than
+                            # reusing loop variables from the dict branch
+                            key, value = item['key'], item['value']
+                            chunk = Chunk(
+                                modal='qa',
+                                content_or_path=key,
+                                metadata={'read': qa_pair_file,
+                                          'source': qa_pair_file,
+                                          'qa': f'{key}: {value}'}
+                            )
+                            chunks.append(chunk)
+
+            logger.info(f"Processed {len(chunks)} QA pairs from {qa_pair_file}")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing QA pairs from {qa_pair_file}: {str(e)}")
+            return []
+
+    def build_dense(self, files: List[FileName], work_dir: str,
+                    markdown_as_txt: bool = False,
+                    qa_pair_file: Optional[str] = None):
         """Extract the features required for the response pipeline based on the
         document."""
         feature_dir = os.path.join(work_dir, 'db_dense')
@@ -179,7 +251,14 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
 
         file_opr = FileOperation()
         chunks = []
-
+
+        # Process QA pairs if provided
+        if qa_pair_file is not None:
+            qa_chunks = self.process_qa_pairs(qa_pair_file)
+            chunks.extend(qa_chunks)
+            logger.info(f"Added {len(qa_chunks)} chunks from QA pairs")
+
+        # Process regular files
         for i, file in tqdm(enumerate(files), 'split'):
             if not file.state:
                 continue
@@ -205,7 +284,7 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
                 texts=[text], metadatas=[metadata])
 
         if not self.embedder.support_image:
-            filtered_chunks = list(filter(lambda x: x.modal == 'text', chunks))
+            filtered_chunks = list(filter(lambda x: x.modal in ('text', 'qa'), chunks))
         else:
             filtered_chunks = chunks
         if len(chunks) < 1:
@@ -318,24 +397,27 @@ def preprocess(self, files: List, work_dir: str):
                 file.state = False
                 file.reason = 'read error'
 
-    def initialize(self, files: list, ner_file: str, work_dir: str):
+    def initialize(self, config: InitializeConfig):
         """Initializes response and reject feature store.
 
         Only needs to be called once. Also calculates the optimal threshold
         based on provided good and bad question examples, and saves it in the
         configuration file.
+
+        Args:
+            config: Configuration object containing initialization parameters.
         """
         logger.info(
             'initialize response and reject feature store, you only need call this once.'  # noqa E501
         )
-        self.preprocess(files=files, work_dir=work_dir)
+        self.preprocess(files=config.files, work_dir=config.work_dir)
         # build dense retrieval refusal-to-answer and response database
-        documents = list(filter(lambda x: x._type != 'code', files))
-        chunks = self.build_dense(files=documents, work_dir=work_dir)
+        documents = list(filter(lambda x: x._type != 'code', config.files))
+        chunks = self.build_dense(files=documents,
+                                  work_dir=config.work_dir,
+                                  qa_pair_file=config.qa_pair_file)
 
-        codes = list(filter(lambda x: x._type == 'code', files))
-        self.build_sparse(files=codes, work_dir=work_dir)
-        self.build_inverted_index(chunks=chunks, ner_file=ner_file, work_dir=work_dir)
+        codes = list(filter(lambda x: x._type == 'code', config.files))
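+        # code files go to the sparse (BM25) index built by build_sparse above,
+        # not the dense store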
+        self.build_sparse(files=codes, work_dir=config.work_dir)
+        self.build_inverted_index(chunks=chunks,
+                                  ner_file=config.ner_file,
+                                  work_dir=config.work_dir)
 
 def parse_args():
     """Parse command-line arguments."""
@@ -371,6 +453,11 @@ def parse_args():
         default=None,
         help='The path of NER file, which is a dumped json list. HuixiangDou would build relationship between entities and chunks for retrieve.'
     )
+    parser.add_argument(
+        '--qa-pair',
+        default=None,
+        help='Path to a CSV or JSON file containing QA pairs. For CSV, the first column is the key and the second column is the value. For JSON, the format should be {"key": "value"} or a list of {"key": "key1", "value": "value1"}.'
+    )
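+    # Example invocation (illustrative; assumes the project's usual module
+    # entry point):
+    #   python3 -m huixiangdou.service.feature_store --qa-pair qa_pairs.csv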
     parser.add_argument(
         '--sample', help='Input an json file, save reject and search output.')
     parser.add_argument(
@@ -459,8 +546,16 @@ def test_query(retriever: Retriever, sample: str = None):
     file_opr = FileOperation()
 
     files = file_opr.scan_dir(repo_dir=args.repo_dir)
+
+    # Create the configuration object
+    init_config = InitializeConfig(
+        files=files,
+        work_dir=args.work_dir,
+        ner_file=args.ner_file,
+        qa_pair_file=args.qa_pair
+    )
 
-    fs_init.initialize(files=files, ner_file=args.ner_file, work_dir=args.work_dir)
+    fs_init.initialize(config=init_config)
     file_opr.summarize(files)
     del fs_init
 