Skip to content

Commit e0d94b5

Browse files
Fix/circular import (#439)
This commit: adds QA pair support and refactors the initialize method; updates the README with QA pair feature documentation; fixes the circular import warning in the services module. Co-authored-by: openhands <[email protected]>
1 parent 35ae981 commit e0d94b5

File tree

9 files changed

+131
-21
lines changed

9 files changed

+131
-21
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,11 @@ cp -rf resource/data* repodir/
212212
# Build knowledge base, this will save the features of repodir to workdir, and update the positive and negative example thresholds into `config.ini`
213213
mkdir workdir
214214
python3 -m huixiangdou.services.store
215+
216+
# You can also build knowledge base from QA pairs (CSV or JSON format)
217+
# CSV: First column is key (question), second column is value (answer)
218+
# JSON: {"question1": "answer1", "question2": "answer2", ...}
219+
# python3 -m huixiangdou.services.store --qa-pair resource/data/qa_pair.csv
215220
```
216221

217222
## III. Setup LLM API and test

README_zh.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,11 @@ cp -rf resource/data* repodir/
211211
# 建立知识库,repodir 的特征会保存到 workdir,拒答阈值也会自动更新进 `config.ini`
212212
mkdir workdir
213213
python3 -m huixiangdou.services.store
214+
215+
# 你也可以从问答对(QA pairs)构建知识库(支持 CSV 或 JSON 格式)
216+
# CSV 格式:第一列为问题(key),第二列为答案(value)
217+
# JSON 格式:{"问题1": "答案1", "问题2": "答案2", ...}
218+
# python3 -m huixiangdou.services.store --qa-pair resource/data/qa_pair.csv
214219
```
215220

216221
## 三、配置 LLM,运行测试

huixiangdou/primitive/chunk.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ class Chunk():
2222
modal: str = 'text'
2323

2424
def __post_init__(self):
    """Reject construction of a Chunk whose modal is not supported."""
    supported = ('text', 'image', 'audio', 'qa')
    if self.modal in supported:
        return
    raise ValueError(
        f'Invalid modal: {self.modal}. Allowed values are: `text`, `image`, `audio`, `qa`'
    )
2929

3030
def __str__(self) -> str:

huixiangdou/primitive/faiss.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def save_local(self, folder_path: str, chunks: List[Chunk],
167167
for chunk in tqdm(chunks, 'chunks'):
168168
np_feature = None
169169
try:
170-
if chunk.modal == 'text':
170+
if chunk.modal == 'text' or chunk.modal == 'qa':
171171
np_feature = embedder.embed_query(text=chunk.content_or_path)
172172
elif chunk.modal == 'image':
173173
np_feature = embedder.embed_query(path=chunk.content_or_path)

huixiangdou/services/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""LLM service module."""
22
from .config import (feature_store_base_dir, redis_host, redis_passwd,
33
redis_port)
4-
from .store import FeatureStore # noqa E401
54
from .helper import (ErrorCode, QueryTracker, Queue, TaskCode,
65
build_reply_text, check_str_useful, histogram, kimi_ocr,
76
multimodal, parse_json_str)
@@ -10,3 +9,5 @@
109
from .web_search import WebSearch # noqa E401
1110
from .serial_pipeline import SerialPipeline
1211
from .parallel_pipeline import ParallelPipeline
12+
# Import FeatureStore at the end to avoid circular imports
13+
from .store import FeatureStore # noqa E401

huixiangdou/services/llm.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,7 @@ async def chat(self,
193193
try:
194194
response = await openai_async_client.chat.completions.create(**kwargs)
195195
except Exception as e:
196-
import pdb
197-
pdb.set_trace()
196+
logger.error(str(e) + ' input len {}'.format(len(str(messages))))
198197
pass
199198
logger.info(response.choices[0].message.content)
200199

huixiangdou/services/retriever.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def rerank_fuse(self, query: Union[Query, str], chunks: List[Chunk], context_max
169169
context = ''
170170
references = []
171171
ref_texts = []
172-
for idx, chunk in enumerate(rerank_chunks):
172+
for chunk in rerank_chunks:
173173

174174
content = chunk.content_or_path
175175
splits.append(content)
@@ -178,12 +178,14 @@ def rerank_fuse(self, query: Union[Query, str], chunks: List[Chunk], context_max
178178
if '://' in source:
179179
# url
180180
file_text = content
181-
else:
181+
elif chunk.modal == 'text':
182182
file_text, error = file_opr.read(chunk.metadata['read'])
183183
if error is not None:
184184
# read file failed, skip
185185
continue
186-
186+
elif chunk.modal == 'qa':
187+
file_text = chunk.metadata['qa']
188+
187189
logger.info('target {} content length {}'.format(
188190
source, len(file_text)))
189191
if len(file_text) + len(context) > context_max_length:

huixiangdou/services/store.py

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
import re
77
import shutil
88
import time
9+
import csv
10+
from dataclasses import dataclass
911
from multiprocessing import Pool
10-
from typing import Any, Dict, List, Optional
12+
from typing import Any, Dict, List, Optional, Tuple
1113
import random
1214
import pytoml
1315
from loguru import logger
@@ -22,6 +24,15 @@
2224
from .helper import histogram
2325
from .retriever import CacheRetriever, Retriever
2426

27+
28+
@dataclass
class InitializeConfig:
    """Configuration for initializing the feature store.

    Groups the parameters of ``FeatureStore.initialize`` into one object
    (the method previously took ``files``, ``ner_file`` and ``work_dir``
    as separate arguments).
    """
    # Scanned repository files (documents and code) to index.
    files: List[FileName]
    # Directory where the dense/sparse/inverted-index databases are written.
    work_dir: str
    # Path to a dumped JSON list of NER entities; skipped when None.
    ner_file: Optional[str] = None
    # Path to a CSV or JSON file of question-answer pairs; skipped when None.
    qa_pair_file: Optional[str] = None
35+
2536
def empty_cache():
2637
try:
2738
from torch.cuda import empty_cache as cuda_empty_cache
@@ -170,7 +181,68 @@ def build_sparse(self, files: List[FileName], work_dir: str):
170181
bm25 = BM25Okapi()
171182
bm25.save(chunks, sparse_dir)
172183

173-
def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool=False):
184+
def process_qa_pairs(self, qa_pair_file: str) -> List[Chunk]:
    """Process QA pairs from a CSV or JSON file.

    Args:
        qa_pair_file: Path to the CSV or JSON file containing QA pairs.
            CSV: first column is the key (question), second is the value
            (answer). JSON: either a ``{"question": "answer", ...}`` mapping
            or a list of ``{"key": ..., "value": ...}`` objects.

    Returns:
        List of Chunk objects with ``modal='qa'`` where the question is the
        embeddable content and ``"key: value"`` is stored in
        ``metadata['qa']``. Returns an empty list on read/parse errors.
    """

    def _make_chunk(key, value):
        # The question alone is embedded; the joined pair is kept in
        # metadata so the retriever can surface the full answer text.
        return Chunk(
            modal='qa',
            content_or_path=key,
            metadata={'read': qa_pair_file, 'source': qa_pair_file, 'qa': f'{key}: {value}'}
        )

    chunks = []
    file_ext = os.path.splitext(qa_pair_file)[1].lower()

    try:
        if file_ext == '.csv':
            # CSV file - first column is key, second column is value.
            with open(qa_pair_file, 'r', encoding='utf-8') as f:
                for row in csv.reader(f):
                    # Skip malformed rows that lack both columns.
                    if len(row) >= 2:
                        chunks.append(_make_chunk(row[0], row[1]))

        elif file_ext == '.json':
            with open(qa_pair_file, 'r', encoding='utf-8') as f:
                qa_data = json.load(f)

            if isinstance(qa_data, dict):
                # Format: {"key1": "value1", "key2": "value2", ...}
                for key, value in qa_data.items():
                    chunks.append(_make_chunk(key, value))
            elif isinstance(qa_data, list):
                # Format: [{"key": "key1", "value": "value1"}, ...]
                for item in qa_data:
                    if isinstance(item, dict) and 'key' in item and 'value' in item:
                        # BUGFIX: the original built the chunk from stale
                        # `key`/`value` loop variables of the dict branch
                        # (NameError when that branch never ran) instead of
                        # the current item's fields.
                        chunks.append(_make_chunk(item['key'], item['value']))
        else:
            # Previously unsupported extensions fell through silently.
            logger.warning(
                f"Unsupported QA pair file extension '{file_ext}' for {qa_pair_file}; expected .csv or .json")

        logger.info(f"Processed {len(chunks)} QA pairs from {qa_pair_file}")
        return chunks

    except Exception as e:
        logger.error(f"Error processing QA pairs from {qa_pair_file}: {str(e)}")
        return []
245+
def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool=False, qa_pair_file: str = None):
174246
"""Extract the features required for the response pipeline based on the
175247
document."""
176248
feature_dir = os.path.join(work_dir, 'db_dense')
@@ -179,7 +251,14 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: boo
179251

180252
file_opr = FileOperation()
181253
chunks = []
182-
254+
255+
# Process QA pairs if provided
256+
if qa_pair_file is not None:
257+
qa_chunks = self.process_qa_pairs(qa_pair_file)
258+
chunks.extend(qa_chunks)
259+
logger.info(f"Added {len(qa_chunks)} chunks from QA pairs")
260+
261+
# Process regular files
183262
for i, file in tqdm(enumerate(files), 'split'):
184263
if not file.state:
185264
continue
@@ -205,7 +284,7 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: boo
205284
texts=[text], metadatas=[metadata])
206285

207286
if not self.embedder.support_image:
208-
filtered_chunks = list(filter(lambda x: x.modal=='text', chunks))
287+
filtered_chunks = list(filter(lambda x: x.modal=='text' or x.modal=='qa', chunks))
209288
else:
210289
filtered_chunks = chunks
211290
if len(chunks) < 1:
@@ -318,24 +397,27 @@ def preprocess(self, files: List, work_dir: str):
318397
file.state = False
319398
file.reason = 'read error'
320399

321-
def initialize(self, config: InitializeConfig):
    """Initializes response and reject feature store.

    Only needs to be called once. Also calculates the optimal threshold
    based on provided good and bad question examples, and saves it in the
    configuration file.

    Args:
        config: Configuration object containing initialization parameters
    """
    logger.info(
        'initialize response and reject feature store, you only need call this once.'  # noqa E501
    )
    work_dir = config.work_dir
    self.preprocess(files=config.files, work_dir=work_dir)

    # Dense retrieval (refusal-to-answer and response DB) indexes the
    # non-code documents; code files go to the sparse BM25 index instead.
    doc_files = [f for f in config.files if f._type != 'code']
    chunks = self.build_dense(files=doc_files,
                              work_dir=work_dir,
                              qa_pair_file=config.qa_pair_file)

    code_files = [f for f in config.files if f._type == 'code']
    self.build_sparse(files=code_files, work_dir=work_dir)
    self.build_inverted_index(chunks=chunks,
                              ner_file=config.ner_file,
                              work_dir=work_dir)
339421

340422
def parse_args():
341423
"""Parse command-line arguments."""
@@ -371,6 +453,11 @@ def parse_args():
371453
default=None,
372454
help='The path of NER file, which is a dumped json list. HuixiangDou would build relationship between entities and chunks for retrieve.'
373455
)
456+
parser.add_argument(
457+
'--qa-pair',
458+
default=None,
459+
help='Path to a CSV or JSON file containing QA pairs. For CSV, the first column is the key and the second column is the value. For JSON, the format should be {"key":"value"} or a list of {"key":"key1", "value":"value1"}.'
460+
)
374461
parser.add_argument(
375462
'--sample', help='Input an json file, save reject and search output.')
376463
parser.add_argument(
@@ -459,8 +546,16 @@ def test_query(retriever: Retriever, sample: str = None):
459546
file_opr = FileOperation()
460547

461548
files = file_opr.scan_dir(repo_dir=args.repo_dir)
549+
550+
# Create configuration object
551+
init_config = InitializeConfig(
552+
files=files,
553+
work_dir=args.work_dir,
554+
ner_file=args.ner_file,
555+
qa_pair_file=args.qa_pair
556+
)
462557

463-
fs_init.initialize(files=files, ner_file=args.ner_file, work_dir=args.work_dir)
558+
fs_init.initialize(config=init_config)
464559
file_opr.summarize(files)
465560
del fs_init
466561

resource/data/qa_pair.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"What is HuixiangDou?","HuixiangDou is an AI assistant that can answer questions based on your knowledge base."
2+
"How to use HuixiangDou?","You can use HuixiangDou by providing a knowledge base and asking questions related to it."
3+
"What features does HuixiangDou support?","HuixiangDou supports text embedding, document retrieval, and question answering."

0 commit comments

Comments
 (0)