diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 17:33:42 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 17:33:42 -0700 |
commit | 9618d5146eea046342b69895e68b937a056d2816 (patch) | |
tree | e5ff7d221e45206dcc213c9dfc98518c502cc28b /python/fatcat_tools | |
parent | 6e0736cebcb2b1e5ddbae03127572ad9d1ffca49 (diff) | |
download | fatcat-9618d5146eea046342b69895e68b937a056d2816.tar.gz fatcat-9618d5146eea046342b69895e68b937a056d2816.zip |
new SPN web (html) importer
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 106 |
2 files changed, 81 insertions, 27 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 6a2edeac..9cb18506 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -27,7 +27,7 @@ from .orcid import OrcidImporter from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat -from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter +from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter, SavePaperNowWebImporter from .shadow import ShadowLibraryImporter from .file_meta import FileMetaImporter from .doaj_article import DoajArticleImporter diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index fc02058b..bc759219 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -222,6 +222,12 @@ class IngestFileResultImporter(EntityImporter): if edit_extra['link_source'] == 'doi': edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() + # GROBID metadata, for SPN requests (when there might not be 'success') + if request.get('ingest_type') == 'pdf': + if row.get('grobid') and row['grobid'].get('status') != 'success': + edit_extra['grobid_status_code'] = row['grobid']['status_code'] + edit_extra['grobid_version'] = row['grobid'].get('grobid_version') + return edit_extra def parse_record(self, row): @@ -304,11 +310,19 @@ class IngestFileResultImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( + if self.submit_mode: + eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + extra=self.editgroup_extra)) + for fe in batch: + self.api.create_file(eg.editgroup_id, fe) + self.api.update_editgroup(eg.editgroup_id, eg, submit=True) + else: + self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) class SavePaperNowFileImporter(IngestFileResultImporter): @@ -350,24 +364,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter): return True - def insert_batch(self, batch): - """ - Usually running in submit_mode, so we can't use auto_batch method - """ - if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) - for fe in batch: - self.api.create_file(eg.editgroup_id, fe) - self.api.update_editgroup(eg.editgroup_id, eg, submit=True) - else: - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) - class IngestWebResultImporter(IngestFileResultImporter): """ @@ -514,8 +510,66 @@ class IngestWebResultImporter(IngestFileResultImporter): return True def insert_batch(self, batch): - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( + if self.submit_mode: + eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + extra=self.editgroup_extra)) + for fe in batch: + self.api.create_webcapture(eg.editgroup_id, fe) + self.api.update_editgroup(eg.editgroup_id, eg, submit=True) + else: + self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + +class SavePaperNowWebImporter(IngestWebResultImporter): + """ + Like SavePaperNowFileImporter, but for webcapture (HTML) ingest. + """ + + def __init__(self, api, submit_mode=True, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebSavePaperNow') + kwargs['submit_mode'] = submit_mode + kwargs['do_updates'] = False + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + def want(self, row): + """ + Relatively custom want() here, a synthesis of other filters. + + We do currently allow unknown-scope through for this specific code + path, which means allowing hit=false. + """ + + source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False + if not source.startswith('savepapernow'): + self.counts['skip-not-savepapernow'] += 1 + return False + + # webcapture-specific filters + if row['request'].get('ingest_type') != 'html': + self.counts['skip-ingest-type'] += 1 + return False + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): + self.counts['skip-mimetype'] += 1 + return False + + if row.get('status') not in ['success', 'unknown-scope']: + self.counts['skip-hit'] += 1 + return False + + return True |