diff --git a/src/refined/offline_data_generation/process_wiki.py b/src/refined/offline_data_generation/process_wiki.py index 9b6442f..7e012c4 100644 --- a/src/refined/offline_data_generation/process_wiki.py +++ b/src/refined/offline_data_generation/process_wiki.py @@ -61,7 +61,7 @@ def build_redirects(args=None): def generate_wiki_id_to_title(page_sql_gz_filepath: str, output_dir: str) -> Dict[str, str]: # page_id, namespace, title, restrictions, redirect, new, random, touched, links, latest, len, content_model, lang page_id_to_title: Dict[str, str] = dict() - pattern = re.compile("([0-9]+),([0-9]+),(.+),(.+),([0-9]+),([0-9]+),(.+),(.+),(.+),([0-9]+),([0-9]+),(.+),(.+)") + pattern = re.compile("([0-9]+),([0-9]+),'(.+)',([0-9]+),([0-9]+),([0-9\.]+),'(.+)','(.+)',([0-9]+),([0-9]+),'(.+)',(.+)") wiki_id_to_title_file = open(f'{output_dir}/wiki_id_to_title.json', 'w') with gzip.open(page_sql_gz_filepath, 'r') as f: for line in tqdm(f, total=5775): @@ -76,11 +76,10 @@ def generate_wiki_id_to_title(page_sql_gz_filepath: str, output_dir: str) -> Dic if m is None: continue groups = m.groups() - page_id, namespace, title, restrictions, redirect, new, random, touched, links, \ + page_id, namespace, title, redirect, new, random, touched, links, \ latest, length, content_model, lang = groups if not namespace == '0': continue - title = title[1:-1] page_id_to_title[page_id] = title wiki_id_to_title_file.write(json.dumps({'wiki_page_id': page_id, 'wiki_title': title}) + '\n') wiki_id_to_title_file.close()