 import re
 import shutil
 import time
+import csv
+from dataclasses import dataclass
 from multiprocessing import Pool
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 import random
 import pytoml
 from loguru import logger
 from .helper import histogram
 from .retriever import CacheRetriever, Retriever
 
+
+@dataclass
+class InitializeConfig:
+    """Configuration for initializing the feature store."""
+    files: List[FileName]
+    work_dir: str
+    ner_file: Optional[str] = None
+    qa_pair_file: Optional[str] = None
+
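+# A minimal usage sketch (illustrative only; `fs_init` mirrors the CLI entry
+# point at the bottom of this diff):
+#   config = InitializeConfig(files=files, work_dir='workdir',
+#                             qa_pair_file='qa_pairs.csv')
+#   fs_init.initialize(config=config)
+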
 def empty_cache():
     try:
         from torch.cuda import empty_cache as cuda_empty_cache
@@ -170,7 +181,68 @@ def build_sparse(self, files: List[FileName], work_dir: str):
         bm25 = BM25Okapi()
         bm25.save(chunks, sparse_dir)
 
-    def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
+    def process_qa_pairs(self, qa_pair_file: str) -> List[Chunk]:
+        """Process QA pairs from a CSV or JSON file.
+
+        Args:
+            qa_pair_file: Path to the CSV or JSON file containing QA pairs.
+
+        Returns:
+            List of Chunk objects, where each key becomes the chunk content
+            and the value is kept in the chunk metadata.
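+
+        Example inputs (illustrative):
+            CSV:  "What is HuixiangDou?","A domain knowledge assistant."
+            JSON: {"question1": "answer1"} or
+                  [{"key": "question1", "value": "answer1"}]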
192+ """
193+ chunks = []
194+ file_ext = os .path .splitext (qa_pair_file )[1 ].lower ()
195+
+        try:
+            if file_ext == '.csv':
+                # CSV file: first column is the key, second column the value
+                with open(qa_pair_file, 'r', encoding='utf-8') as f:
+                    csv_reader = csv.reader(f)
+                    for row in csv_reader:
+                        if len(row) >= 2:
+                            key, value = row[0], row[1]
+                            # Create a chunk with the key as content and the
+                            # value in metadata
+                            chunk = Chunk(
+                                modal='qa',
+                                content_or_path=key,
+                                metadata={'read': qa_pair_file,
+                                          'source': qa_pair_file,
+                                          'qa': f'{key}: {value}'}
+                            )
+                            chunks.append(chunk)
+
+            elif file_ext == '.json':
+                # Process JSON file
+                with open(qa_pair_file, 'r', encoding='utf-8') as f:
+                    qa_data = json.load(f)
+
+                # Handle different JSON formats
+                if isinstance(qa_data, dict):
+                    # Format: {"key1": "value1", "key2": "value2", ...}
+                    for key, value in qa_data.items():
+                        chunk = Chunk(
+                            modal='qa',
+                            content_or_path=key,
+                            metadata={'read': qa_pair_file,
+                                      'source': qa_pair_file,
+                                      'qa': f'{key}: {value}'}
+                        )
+                        chunks.append(chunk)
+                elif isinstance(qa_data, list):
+                    # Format: [{"key": "key1", "value": "value1"}, ...]
+                    for item in qa_data:
+                        if isinstance(item, dict) and 'key' in item and 'value' in item:
+                            # take the pair from the item itself rather than
+                            # reusing loop variables from the dict branch
+                            key, value = item['key'], item['value']
+                            chunk = Chunk(
+                                modal='qa',
+                                content_or_path=key,
+                                metadata={'read': qa_pair_file,
+                                          'source': qa_pair_file,
+                                          'qa': f'{key}: {value}'}
+                            )
+                            chunks.append(chunk)
+
+            logger.info(f"Processed {len(chunks)} QA pairs from {qa_pair_file}")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing QA pairs from {qa_pair_file}: {str(e)}")
+            return []
+
+    def build_dense(self, files: List[FileName], work_dir: str,
+                    markdown_as_txt: bool = False,
+                    qa_pair_file: Optional[str] = None):
         """Extract the features required for the response pipeline based on the
         document."""
         feature_dir = os.path.join(work_dir, 'db_dense')
@@ -179,7 +251,14 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
 
         file_opr = FileOperation()
         chunks = []
-
+
+        # Process QA pairs if provided
+        if qa_pair_file is not None:
+            qa_chunks = self.process_qa_pairs(qa_pair_file)
+            chunks.extend(qa_chunks)
+            logger.info(f"Added {len(qa_chunks)} chunks from QA pairs")
+
+        # Process regular files
         for i, file in tqdm(enumerate(files), 'split'):
             if not file.state:
                 continue
@@ -205,7 +284,7 @@ def build_dense(self, files: List[FileName], work_dir: str, markdown_as_txt: bool = False):
                 texts=[text], metadatas=[metadata])
 
         if not self.embedder.support_image:
-            filtered_chunks = list(filter(lambda x: x.modal == 'text', chunks))
+            filtered_chunks = list(filter(lambda x: x.modal in ('text', 'qa'), chunks))
         else:
             filtered_chunks = chunks
         if len(chunks) < 1:
@@ -318,24 +397,27 @@ def preprocess(self, files: List, work_dir: str):
                 file.state = False
                 file.reason = 'read error'
 
-    def initialize(self, files: list, ner_file: str, work_dir: str):
+    def initialize(self, config: InitializeConfig):
         """Initializes response and reject feature store.
 
         Only needs to be called once. Also calculates the optimal threshold
         based on provided good and bad question examples, and saves it in the
         configuration file.
+
+        Args:
+            config: Configuration object containing initialization parameters.
         """
         logger.info(
             'initialize response and reject feature store, you only need call this once.'  # noqa E501
         )
-        self.preprocess(files=files, work_dir=work_dir)
+        self.preprocess(files=config.files, work_dir=config.work_dir)
         # build dense retrieval refusal-to-answer and response database
-        documents = list(filter(lambda x: x._type != 'code', files))
-        chunks = self.build_dense(files=documents, work_dir=work_dir)
+        documents = list(filter(lambda x: x._type != 'code', config.files))
+        chunks = self.build_dense(files=documents,
+                                  work_dir=config.work_dir,
+                                  qa_pair_file=config.qa_pair_file)
 
-        codes = list(filter(lambda x: x._type == 'code', files))
-        self.build_sparse(files=codes, work_dir=work_dir)
-        self.build_inverted_index(chunks=chunks, ner_file=ner_file, work_dir=work_dir)
+        codes = list(filter(lambda x: x._type == 'code', config.files))
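+        # code files go to the sparse (BM25) index built by build_sparse above,
+        # not the dense store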
+        self.build_sparse(files=codes, work_dir=config.work_dir)
+        self.build_inverted_index(chunks=chunks,
+                                  ner_file=config.ner_file,
+                                  work_dir=config.work_dir)
 
 def parse_args():
     """Parse command-line arguments."""
@@ -371,6 +453,11 @@ def parse_args():
         default=None,
         help='The path of NER file, which is a dumped json list. HuixiangDou would build relationship between entities and chunks for retrieve.'
     )
+    parser.add_argument(
+        '--qa-pair',
+        default=None,
+        help='Path to a CSV or JSON file containing QA pairs. For CSV, the first column is the key and the second column is the value. For JSON, the format should be {"key": "value"} or a list of {"key": "key1", "value": "value1"}.'
+    )
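+    # Example invocation (illustrative; assumes the project's usual module
+    # entry point):
+    #   python3 -m huixiangdou.service.feature_store --qa-pair qa_pairs.csv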
     parser.add_argument(
         '--sample', help='Input an json file, save reject and search output.')
     parser.add_argument(
@@ -459,8 +546,16 @@ def test_query(retriever: Retriever, sample: str = None):
     file_opr = FileOperation()
 
     files = file_opr.scan_dir(repo_dir=args.repo_dir)
+
+    # Create the configuration object
+    init_config = InitializeConfig(
+        files=files,
+        work_dir=args.work_dir,
+        ner_file=args.ner_file,
+        qa_pair_file=args.qa_pair
+    )
 
-    fs_init.initialize(files=files, ner_file=args.ner_file, work_dir=args.work_dir)
+    fs_init.initialize(config=init_config)
     file_opr.summarize(files)
     del fs_init
 