diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:47:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:48:11 -0700 |
commit | c71314e46dcf18905d1957579a211bb47c520d57 (patch) | |
tree | 5f50e19de1e764b8840334c7578b5c3f0b51b367 | |
parent | 1ccd50eca45667aaf232f3bfb6a5aafadf17fc09 (diff) | |
download | fatcat-scholar-c71314e46dcf18905d1957579a211bb47c520d57.tar.gz fatcat-scholar-c71314e46dcf18905d1957579a211bb47c520d57.zip |
local pdftotext cache dir hack
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 19 |
2 files changed, 19 insertions, 1 deletion
```diff
@@ -1,4 +1,5 @@
 data/
+fulltext_web/
 
 *.o
 *.a
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index ebc2923..c93cb29 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -67,11 +67,12 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:
 
 class WorkPipeline():
 
-    def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient):
+    def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient, fulltext_cache_dir=None):
         self.issue_db: IssueDB = issue_db
         self.ia_client = internetarchive.get_session()
         self.sandcrawler_db_client = sandcrawler_db_client
         self.sandcrawler_s3_client = sandcrawler_s3_client
+        self.fulltext_cache_dir = fulltext_cache_dir
 
     def fetch_file_grobid(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
         """
@@ -107,6 +108,19 @@ class WorkPipeline():
             release_ident: Optional[str]
             file_ident: Optional[str]
         """
+        # HACK: look for local pdftotext output
+        if self.fulltext_cache_dir:
+            local_txt_path = f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt"
+            try:
+                with open(local_txt_path, 'r') as txt_file:
+                    raw_text = txt_file.read()
+                    return dict(
+                        raw_text=raw_text,
+                        release_ident=release_ident,
+                        file_ident=fe.ident,
+                    )
+            except FileNotFoundError:
+                pass
         return None
 
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
@@ -307,6 +321,9 @@ def main():
     sub.add_argument("json_file",
         help="release entities, as JSON-lines",
         nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+    sub.add_argument("--fulltext-cache-dir",
+        help="path of local directory with pdftotext fulltext (and thumbnails)",
+        default=None, type=str)
 
     args = parser.parse_args()
     if not args.__dict__.get("func"):
```