diff options
author | Martin Czygan <martin@archive.org> | 2020-08-13 18:38:29 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-08-13 18:38:29 +0000 |
commit | b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a (patch) | |
tree | c758be171e8367bd9f4765f576c7c3c19cebdb6a /python/fatcat_tools/workers | |
parent | 1f75aa4cd10947f725eb3db2a51377579a09eb01 (diff) | |
parent | 03d2004717d36962aef1bd373d59ce799d7db9ab (diff) | |
download | fatcat-b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a.tar.gz fatcat-b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a.zip |
Merge branch 'bnewbold-ingest-improvements' into 'master'
ingest behavior changes; some datacite metadata tweaks
See merge request webgroup/fatcat!78
Diffstat (limited to 'python/fatcat_tools/workers')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 38 |
1 files changed, 34 insertions, 4 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d5891ad1..65a8fcd8 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker): self.ingest_file_request_topic = ingest_file_request_topic self.poll_interval = poll_interval self.consumer_group = "entity-updates" - self.ingest_oa_only = True + self.ingest_oa_only = False self.ingest_pdf_doi_prefix_blocklist = [ # gbif.org: many DOIs, not PDF fulltext "10.15468/", @@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker): "10.3932/", # ccdc.cam.ac.uk: crystal structures "10.5517/", + # researchgate: mostly blocks our crawler + "10.13140/", + # springerlink: mostly blocks crawler + "10.1007/", + # nature group: mostly blocks crawler + "10.1038/", + # SAGE: mostly blocks crawler + "10.1177/", + # IOP: mostly blocks crawler + "10.1088/", ] self.live_pdf_ingest_doi_prefix_acceptlist = [ # biorxiv and medrxiv "10.1101/", - # researchgate - "10.13140/", # the lancet (often hybrid OA) "10.1016/s0140-6736", "10.1016/s2213-2600", @@ -151,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker): link_source = ingest_request.get('ingest_request') ingest_type = ingest_request.get('ingest_type') doi = ingest_request.get('ext_ids', {}).get('doi') + es = release_to_elasticsearch(release) is_document = release.release_type in ( 'article', @@ -167,6 +176,7 @@ class EntityUpdatesWorker(FatcatWorker): 'paper-conference', 'patent', 'peer_review', + 'post', 'report', 'retraction', 'review', @@ -191,7 +201,7 @@ class EntityUpdatesWorker(FatcatWorker): in_acceptlist = True if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): - es = release_to_elasticsearch(release) + # most datacite documents are in IRs and should be crawled is_datacite_doc = False if release.extra and ('datacite' in release.extra) and is_document: @@ -199,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker): if not (es['is_oa'] or in_acceptlist or is_datacite_doc): return False + # big publishers *generally* have accurate OA metadata, use + # preservation networks, and block our crawlers. So unless OA, or + # explicitly on accept list, or not preserved, skip crawling + if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist): + return False + # if ingest_type is pdf but release_type is almost certainly not a PDF, # skip it. This is mostly a datacite thing. if ingest_type == "pdf" and is_not_pdf: @@ -209,6 +225,20 @@ class EntityUpdatesWorker(FatcatWorker): if doi.startswith(prefix): return False + # figshare + if doi and doi.startswith('10.6084/') or doi.startswith('10.25384/'): + # don't crawl "most recent version" (aka "group") DOIs + if not release.version: + return False + + # zenodo + if doi and doi.startswith('10.5281/'): + # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs) + if release.extra and release.extra.get('relations'): + for rel in release.extra['relations']: + if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')): + return False + return True def run(self): |