temp dir for each survey
- app.py +26 -19
- src/Surveyor.py +36 -41
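
In short, this commit moves per-survey directory setup out of Surveyor.__init__ and into Surveyor.survey(), which now accepts pdf_dir, txt_dir, img_dir, tab_dir and dump_dir keyword arguments and falls back to its DEFAULTS when they are omitted, while app.py builds a fresh hash-named workspace for every request and passes it in. A hypothetical call under the new signature (the query and paths below are illustrative, not from the commit):

    surveyor = Surveyor()  # directory kwargs no longer exist at construction time
    zip_file_name, survey_file_name = surveyor.survey(
        'domain adaptation',            # illustrative query
        max_search=50,
        num_papers=10,
        pdf_dir='run1/pdf', txt_dir='run1/txt', img_dir='run1/img',
        tab_dir='run1/tab', dump_dir='run1/dump',
    )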
app.py
CHANGED
@@ -9,25 +9,32 @@ from pathlib import Path
 from src.Surveyor import Surveyor
 
 
-def get_surveyor_instance(
+@st.experimental_singleton(suppress_st_warning=True)
+def get_surveyor_instance(_print_fn, _survey_print_fn):
     with st.spinner('Loading The-Researcher ...'):
+        return Surveyor(print_fn=_print_fn, survey_print_fn=_survey_print_fn, high_gpu=True)
+
+
+def run_survey(surveyor, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
+    import hashlib
+    import time
+
+    hash = hashlib.sha1()
+    hash.update(str(time.time()).encode())
+    temp_hash = hash.hexdigest()
+    survey_root = Path(temp_hash).resolve()
+    dir_args = {f'{dname}_dir': survey_root / dname for dname in ['pdf', 'txt', 'img', 'tab', 'dump']}
+    for d in dir_args.values():
+        d.mkdir(exist_ok=True, parents=True)
+    print(survey_root)
+    print(dir_args)
+    dir_args = {k: str(v.resolve()) for k, v in dir_args.items()}
+    zip_file_name, survey_file_name = surveyor.survey(research_keywords,
+                                                      arxiv_ids,
+                                                      max_search=max_search,
+                                                      num_papers=num_papers,
+                                                      **dir_args)
+    show_survey_download(zip_file_name, survey_file_name, download_placeholder)
 
 
 def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
@@ -81,7 +88,7 @@ if __name__ == '__main__':
        submit = st.form_submit_button(label="Submit")
        st.sidebar.write('#### execution log:')
 
-       run_kwargs = {'
+       run_kwargs = {'surveyor':get_surveyor_instance(_print_fn=st.sidebar.write, _survey_print_fn=st.write),
                      'download_placeholder':download_placeholder}
        if submit:
            if session_data['research_keywords'] != '':
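
Two details of the new app.py worth calling out. The leading underscores on _print_fn and _survey_print_fn tell st.experimental_singleton not to hash those arguments, which is how the unhashable Streamlit callables can reach the cached factory. And the per-request workspace is keyed by a SHA-1 of the current timestamp; a minimal standalone sketch of that pattern follows (make_survey_dirs is a hypothetical helper, not part of the app):

    import hashlib
    import time
    from pathlib import Path

    def make_survey_dirs(base='.'):
        # one unique root per run: sha1 of the current timestamp, as in run_survey
        run_id = hashlib.sha1(str(time.time()).encode()).hexdigest()
        survey_root = Path(base, run_id).resolve()
        dir_args = {f'{name}_dir': survey_root / name
                    for name in ['pdf', 'txt', 'img', 'tab', 'dump']}
        for d in dir_args.values():
            d.mkdir(exist_ok=True, parents=True)  # parents=True also creates survey_root
        # Surveyor.survey() expects plain string paths, so stringify before splatting
        return {k: str(v) for k, v in dir_args.items()}

    dir_args = make_survey_dirs()
    # zip_file_name, survey_file_name = surveyor.survey(query, None, **dir_args)

tempfile.mkdtemp() would give the same per-run isolation without the (unlikely) chance of two requests hashing identical timestamps.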
src/Surveyor.py
CHANGED
@@ -30,11 +30,6 @@ class Surveyor:
 
     def __init__(
         self,
-        pdf_dir=None,
-        txt_dir=None,
-        img_dir=None,
-        tab_dir=None,
-        dump_dir=None,
         models_dir=None,
         title_model_name=None,
         ex_summ_model_name=None,
@@ -53,11 +48,6 @@ class Surveyor:
         Initializes models and directory structure for the surveyor
 
         Optional Params:
-            - pdf_dir: String, pdf paper storage directory - defaults to arxiv_data/tarpdfs/
-            - txt_dir: String, text-converted paper storage directory - defaults to arxiv_data/fulltext/
-            - img_dir: String, image storage directory - defaults to arxiv_data/images/
-            - tab_dir: String, tables storage directory - defaults to arxiv_data/tables/
-            - dump_dir: String, output dump directory - defaults to arxiv_dumps/
            - models_dir: String, directory to save huge models
            - title_model_name: String, title model name/tag in hugging-face, defaults to `Callidior/bert2bert-base-arxiv-titlegen`
            - ex_summ_model_name: String, extractive summary model name/tag in hugging-face, defaults to `allenai/scibert_scivocab_uncased`
@@ -192,41 +182,41 @@ class Surveyor:
         self.similarity_nlp = spacy.load(similarity_nlp_name)
         self.kw_model = KeyBERT(kw_model_name)
 
-        self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
 
     def define_structure(self, pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
 
         if pdf_dir:
+            survey_pdf_dir = pdf_dir
         else:
+            survey_pdf_dir = self.DEFAULTS["pdf_dir"]
 
         if txt_dir:
+            survey_txt_dir = txt_dir
         else:
+            survey_txt_dir = self.DEFAULTS["txt_dir"]
 
         if img_dir:
+            survey_img_dir = img_dir
         else:
+            survey_img_dir = self.DEFAULTS["img_dir"]
 
         if tab_dir:
+            survey_tab_dir = tab_dir
         else:
+            survey_tab_dir = self.DEFAULTS["tab_dir"]
 
         if dump_dir:
+            survey_dump_dir = dump_dir
         else:
+            survey_dump_dir = self.DEFAULTS["dump_dir"]
 
-        dirs = [
+        dirs = [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir]
         if sum([True for dir in dirs if 'arxiv_data/' in dir]):
             base = os.path.dirname("arxiv_data/")
             if not os.path.exists(base):
                 os.mkdir(base)
         self.clean_dirs(dirs)
+        return dirs
 
     def clean_dirs(self, dirs):
         import shutil
@@ -1345,9 +1335,14 @@ class Surveyor:
         zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)
         zipdir(dump_dir, zipf)
 
-    def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False
+    def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False,
+               pdf_dir=None, txt_dir=None, img_dir=None, tab_dir=None, dump_dir=None):
         import joblib
         import os, shutil
+
+        dirs = self.define_structure(pdf_dir=pdf_dir, txt_dir=txt_dir, img_dir=img_dir, tab_dir=tab_dir, dump_dir=dump_dir)
+        [survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir] = dirs
+
         if not max_search:
             max_search = self.DEFAULTS['max_search']
         if not num_papers:
@@ -1357,39 +1352,39 @@ class Surveyor:
         # arxiv api relevance search and data preparation
         self.print_fn("\n- searching arXiv for top 100 papers.. ")
         results, searched_papers = self.search(query, id_list, max_search=max_search)
-        joblib.dump(searched_papers,
+        joblib.dump(searched_papers, survey_dump_dir + 'papers_metadata.dmp')
         self.print_fn("\n- found " + str(len(searched_papers)) + " papers")
 
         # paper selection by scibert vector embedding relevance scores
         # papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
 
-        papers_highlighted, papers_selected, cites = self.pdf_route(
+        papers_highlighted, papers_selected, cites = self.pdf_route(survey_pdf_dir, survey_txt_dir, survey_img_dir, survey_tab_dir, survey_dump_dir,
                                                                     searched_papers)
 
         if weigh_authors:
             authors = self.author_stats(papers_highlighted)
 
-        joblib.dump(papers_highlighted,
+        joblib.dump(papers_highlighted, survey_dump_dir + 'papers_highlighted.dmp')
 
         self.print_fn("\n- Standardizing known section headings per paper.. ")
         papers_standardized = self.standardize_headings(papers_highlighted)
-        joblib.dump(papers_standardized,
+        joblib.dump(papers_standardized, survey_dump_dir + 'papers_standardized.dmp')
 
         self.print_fn("\n- Building paper-wise corpus.. ")
         corpus = self.build_corpus(papers_highlighted, searched_papers)
-        joblib.dump(corpus,
+        joblib.dump(corpus, survey_dump_dir + 'corpus.dmp')
 
         self.print_fn("\n- Building section-wise corpus.. ")
         corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
-        joblib.dump(corpus_sectionwise,
+        joblib.dump(corpus_sectionwise, survey_dump_dir + 'corpus_sectionwise.dmp')
 
         self.print_fn("\n- Building basic research highlights.. ")
         research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
-        joblib.dump(research_blocks,
+        joblib.dump(research_blocks, survey_dump_dir + 'research_blocks.dmp')
 
         self.print_fn("\n- Reducing corpus to lines.. ")
         corpus_lines = self.get_corpus_lines(corpus)
-        joblib.dump(corpus_lines,
+        joblib.dump(corpus_lines, survey_dump_dir + 'corpus_lines.dmp')
 
         # temp
         # searched_papers = joblib.load(dump_dir + 'papers_metadata.dmp')
@@ -1423,7 +1418,7 @@ class Surveyor:
 
         self.print_fn("\n- Building abstract.. ")
         abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
-        joblib.dump(abstract_block,
+        joblib.dump(abstract_block, survey_dump_dir + 'abstract_block.dmp')
         '''
         self.print_fn("abstract_block type:"+ str(type(abstract_block)))
         self.print_fn("abstract_block:")
@@ -1432,7 +1427,7 @@ class Surveyor:
 
         self.print_fn("\n- Building introduction.. ")
         intro_block = self.get_intro(corpus_sectionwise, research_blocks)
-        joblib.dump(intro_block,
+        joblib.dump(intro_block, survey_dump_dir + 'intro_block.dmp')
         '''
         self.print_fn("intro_block type:"+ str(type(intro_block)))
         self.print_fn("intro_block:")
@@ -1440,8 +1435,8 @@ class Surveyor:
         '''
         self.print_fn("\n- Building custom sections.. ")
         clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
-        joblib.dump(clustered_sections,
-        joblib.dump(clustered_sentences,
+        joblib.dump(clustered_sections, survey_dump_dir + 'clustered_sections.dmp')
+        joblib.dump(clustered_sentences, survey_dump_dir + 'clustered_sentences.dmp')
 
         '''
         self.print_fn("clusters extracted")
@@ -1454,11 +1449,11 @@ class Surveyor:
         '''
         clustered_sections['abstract'] = abstract_block
         clustered_sections['introduction'] = intro_block
-        joblib.dump(clustered_sections,
+        joblib.dump(clustered_sections, survey_dump_dir + 'research_sections.dmp')
 
         self.print_fn("\n- Building conclusion.. ")
         conclusion_block = self.get_conclusion(clustered_sections)
-        joblib.dump(conclusion_block,
+        joblib.dump(conclusion_block, survey_dump_dir + 'conclusion_block.dmp')
         clustered_sections['conclusion'] = conclusion_block
         '''
         self.print_fn("conclusion_block type:"+ str(type(conclusion_block)))
@@ -1469,18 +1464,18 @@ class Surveyor:
         query = self.generate_title(' '.join([v for v in clustered_sections.values()]))
 
         survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
-        survey_file = Path(
+        survey_file = Path(survey_dump_dir).resolve() / survey_file
         self.build_doc(clustered_sections, papers_standardized, query=query, filename=str(survey_file))
 
         self.survey_print_fn("\n-citation-network: ")
         self.survey_print_fn(cites)
 
-        shutil.copytree('arxiv_data/',
+        shutil.copytree('arxiv_data/', survey_dump_dir + '/arxiv_data/')
         assert (os.path.exists(survey_file))
 
         zip_name = 'arxiv_dumps_'+query.replace(' ', '_')+'.zip'
-        zip_name = Path(
-        self.zip_outputs(
+        zip_name = Path(survey_dump_dir).parent.resolve() / zip_name
+        self.zip_outputs(survey_dump_dir, str(zip_name))
         self.print_fn("\n- Survey complete.. \nSurvey file path :" + str(survey_file) +
                       "\nAll outputs zip path :" + str(zip_name))
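
define_structure now resolves each directory to the caller's value or the matching DEFAULTS entry and returns the list so survey() can unpack it; note the truthiness test, so an empty string also falls back to the default. The five if/else blocks collapse into one comprehension; a condensed sketch of the same resolution logic (resolve_dirs is illustrative and assumes a DEFAULTS-style dict keyed by the names below):

    def resolve_dirs(defaults, pdf_dir=None, txt_dir=None, img_dir=None,
                     tab_dir=None, dump_dir=None):
        overrides = {'pdf_dir': pdf_dir, 'txt_dir': txt_dir, 'img_dir': img_dir,
                     'tab_dir': tab_dir, 'dump_dir': dump_dir}
        # the caller's value wins; otherwise fall back to the configured default
        return [overrides[name] or defaults[name] for name in overrides]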
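
Each intermediate artifact is now checkpointed into the survey's private dump directory with joblib, so a partial run can be inspected or resumed from disk. One caveat: survey_dump_dir + 'corpus.dmp' style concatenation only yields the intended path when the directory string ends in a separator, which the arxiv_dumps/-style defaults provide but the resolved paths app.py passes in do not. A sketch of the same checkpointing with os.path.join, which holds either way (checkpoint is a hypothetical helper):

    import os
    import joblib

    def checkpoint(obj, dump_dir, name):
        # correct whether or not dump_dir carries a trailing separator
        path = os.path.join(dump_dir, name + '.dmp')
        joblib.dump(obj, path)
        return path

    # round-trip to resume a run from its dumps:
    # path = checkpoint(corpus, survey_dump_dir, 'corpus')
    # corpus = joblib.load(path)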