diff options
author | bnewbold <bnewbold@archive.org> | 2020-02-25 05:09:09 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-02-25 05:09:09 +0000 |
commit | 1556cdd7f0f5f4bc4fe5ccc9764c1598c852bb9b (patch) | |
tree | 7f31e907be31ecb9c1ef1f6fe94f8f6f17ae86ac /python/fatcat_tools/workers | |
parent | bd6fe91108f7f99f7a88a6b33d9323ffc4d8a6e6 (diff) | |
parent | dc4116f8ebd225eb4af1cecfc75f5c1291589694 (diff) | |
download | fatcat-1556cdd7f0f5f4bc4fe5ccc9764c1598c852bb9b.tar.gz fatcat-1556cdd7f0f5f4bc4fe5ccc9764c1598c852bb9b.zip |
Merge branch 'bnewbold-more-ingest' into 'master'
entity worker: ingest more Datacite releases; filter some out
See merge request webgroup/fatcat!29
Diffstat (limited to 'python/fatcat_tools/workers')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 38 |
1 file changed, 37 insertions, 1 deletion
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 745ee85a..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -123,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
 
+        is_document = release.release_type in (
+            'article-journal',
+            'paper-conference',
+            'article',
+            'report',
+            'chapter',
+            'manuscript',
+            'review',
+            'thesis',
+            'letter',
+            'editorial',
+            'abstract',
+            'entry',
+            'patent',
+            'post',
+            'review-book',
+        )
+        is_not_pdf = release.release_type in (
+            'dataset',
+            'stub',
+            'software',
+            'figure',
+            'graphic',
+        )
+
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
         in_acceptlist = False
         if doi:
             for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -131,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
 
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
             es = release_to_elasticsearch(release)
-            if not es['is_oa'] and not in_acceptlist:
+            # most datacite documents are in IRs and should be crawled
+            is_datacite_doc = False
+            if release.extra and ('datacite' in release.extra) and is_document:
+                is_datacite_doc = True
+            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # if ingest_type is pdf but release_type is almost certainly not a PDF,
+        # skip it. This is mostly a datacite thing.
+        if ingest_type == "pdf" and is_not_pdf:
+            return False
+
         if ingest_type == "pdf" and doi:
             for prefix in self.ingest_pdf_doi_prefix_blocklist:
                 if doi.startswith(prefix):