diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:47:54 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:48:11 -0700 | 
| commit | c71314e46dcf18905d1957579a211bb47c520d57 (patch) | |
| tree | 5f50e19de1e764b8840334c7578b5c3f0b51b367 | |
| parent | 1ccd50eca45667aaf232f3bfb6a5aafadf17fc09 (diff) | |
| download | fatcat-scholar-c71314e46dcf18905d1957579a211bb47c520d57.tar.gz fatcat-scholar-c71314e46dcf18905d1957579a211bb47c520d57.zip | |
local pdftotext cache dir hack
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 19 | 
2 files changed, 19 insertions, 1 deletion
| @@ -1,4 +1,5 @@  data/ +fulltext_web/  *.o  *.a diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index ebc2923..c93cb29 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -67,11 +67,12 @@ def fulltext_pref_list(releases: List[ReleaseEntity]) -> List[str]:  class WorkPipeline(): -    def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient): +    def __init__(self, issue_db: IssueDB, sandcrawler_db_client: SandcrawlerPostgrestClient, sandcrawler_s3_client: SandcrawlerMinioClient, fulltext_cache_dir=None):          self.issue_db: IssueDB = issue_db          self.ia_client = internetarchive.get_session()          self.sandcrawler_db_client = sandcrawler_db_client          self.sandcrawler_s3_client = sandcrawler_s3_client +        self.fulltext_cache_dir = fulltext_cache_dir      def fetch_file_grobid(self, fe: FileEntity, release_ident: str) -> Optional[Any]:          """ @@ -107,6 +108,19 @@ class WorkPipeline():          release_ident: Optional[str]          file_ident: Optional[str]          """ +        # HACK: look for local pdftotext output +        if self.fulltext_cache_dir: +            local_txt_path = f"{self.fulltext_cache_dir}/pdftotext/{fe.sha1[:2]}/{fe.sha1}.txt" +            try: +                with open(local_txt_path, 'r') as txt_file: +                    raw_text = txt_file.read() +                return dict( +                    raw_text=raw_text, +                    release_ident=release_ident, +                    file_ident=fe.ident, +                ) +            except FileNotFoundError: +                pass          return None      def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]: @@ -307,6 +321,9 @@ def main():      sub.add_argument("json_file",          help="release entities, as JSON-lines",          nargs='?', default=sys.stdin, type=argparse.FileType('r')) +    
sub.add_argument("--fulltext-cache-dir", +        help="path of local directory with pdftotext fulltext (and thumbnails)", +        default=None, type=str)      args = parser.parse_args()      if not args.__dict__.get("func"): | 
