From e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 15:07:19 -0700 Subject: update crawl blocklist for SPNv2 requests which mostly fail --- python/fatcat_tools/workers/changelog.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d5891ad1..1ac7a865 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker): "10.3932/", # ccdc.cam.ac.uk: crystal structures "10.5517/", + # researchgate: mostly blocks our crawler + "10.13140/", + # springerlink: mostly blocks crawler + "10.1007/", + # nature group: mostly blocks crawler + "10.1038/", + # SAGE: mostly blocks crawler + "10.1177/", + # IOP: mostly blocks crawler + "10.1088/", ] self.live_pdf_ingest_doi_prefix_acceptlist = [ # biorxiv and medrxiv "10.1101/", - # researchgate - "10.13140/", # the lancet (often hybrid OA) "10.1016/s0140-6736", "10.1016/s2213-2600", -- cgit v1.2.3 From 26646b5636767495881965d566e3889ad6d126e7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:33:15 -0700 Subject: datacite import: refactor publisher-specific hacks into static method Also tweak title/publisher detection to use DOI prefixes --- python/fatcat_tools/importers/datacite.py | 44 ++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index f93362d6..00ce9ccd 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -493,21 +493,6 @@ class DataciteImporter(EntityImporter): if release_type is None: print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) - # release_type exception: 
Global Biodiversity Information Facility - # publishes highly interesting datasets, but titles are mostly the same - # ("GBIF Occurrence Download" or "Occurrence Download"); set - # release_type to "stub" (CSL/FC). - if publisher == 'The Global Biodiversity Information Facility': - release_type = 'stub' - - # release_type exception: lots of "Experimental Crystal Structure Determination" - if publisher == 'Cambridge Crystallographic Data Centre': - release_type = 'entry' - - # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." - if title.lower().startswith('additional file'): - release_type = 'stub' - # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you # like langcodes solves a pretty boring problem. At one level, that's @@ -693,6 +678,35 @@ class DataciteImporter(EntityImporter): license_slug=license_slug, version=version, ) + re = self.biblio_hacks(re) + return re + + @staticmethod + def biblio_hacks(re): + """ + This function handles known special cases. For example, + publisher-specific or platform-specific workarounds. + """ + + # only runs on datacite entities with a DOI + assert re.ext_ids.doi + + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): + re.release_type = 'stub' + + # release_type exception: lots of "Experimental Crystal Structure Determination" + # publisher: "Cambridge Crystallographic Data Centre" + if re.ext_ids.doi.startswith('10.5517/'): + re.release_type = 'entry' + + # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." 
+ if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): + re.release_type = 'component' + return re def try_update(self, re): -- cgit v1.2.3 From 211ef075f5ac2960fa09134043a8246270d99baf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:34:50 -0700 Subject: datacite import: refactor release_type detection into static method --- python/fatcat_tools/importers/datacite.py | 65 ++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 00ce9ccd..0481337a 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -478,20 +478,7 @@ class DataciteImporter(EntityImporter): license_slug = slug license_extra.append(lic) - # Release type. Try to determine the release type from a variety of - # types supplied in datacite. The "attributes.types.resourceType" is - # uncontrolled (170000+ unique values, from "null", "Dataset" to - # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP - # flows in 2009") citeproc may be the closest, but not always supplied. - # Order lookup roughly by completeness of mapping. - for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): - value = attributes.get('types', {}).get(typeType) - release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) - if release_type is not None: - break - - if release_type is None: - print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) + release_type = self.datacite_release_type(doi, attributes) # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). 
Try to crush it with langcodes: "It may sound to you @@ -681,6 +668,38 @@ class DataciteImporter(EntityImporter): re = self.biblio_hacks(re) return re + @staticmethod + def datacite_release_type(doi, attributes): + """ + Release type. Try to determine the release type from a variety of types + supplied in datacite. The "attributes.types.resourceType" is + uncontrolled (170000+ unique values, from "null", "Dataset" to "Jupyter + Notebook" and "Macroseismic Data Points" or "2 days of IP flows in + 2009") citeproc may be the closest, but not always supplied. Order + lookup roughly by completeness of mapping. + """ + + release_type = None + if not attributes.get('types'): + return None + types = attributes['types'] + + for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + value = types.get(typeType) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) + if release_type is not None: + break + + # special case: figshare "collections" which group other entities + if doi.startswith('10.6084/') or doi.startswith('10.25384'): + if types.get('resourceType') == "Collection": + release_type = "stub" + + if release_type is None: + print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr) + + return release_type + @staticmethod def biblio_hacks(re): """ @@ -707,6 +726,24 @@ class DataciteImporter(EntityImporter): if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): re.release_type = 'component' + # figshare + if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): + # set version if DOI ends with versioned suffix + doi_suffix = re.ext_ids.doi.split('.') + if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): + re.version = doi_suffix + # "Figure 123 from " -> component + # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. 
from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" + if " from " in re.title and re.release_type not in ('stub', 'graphic'): + if re.title.startswith("Figure "): + re.release_type = "component" + elif re.title.startswith("Table "): + re.release_type = "component" + + # figshare.com + if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None: + re.container_name = "figshare.com" + return re def try_update(self, re): -- cgit v1.2.3 From ff05a03a3874e17557174d3534a1c2d11e01c4a6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:35:16 -0700 Subject: datacite import: figshare-specific hacks --- python/fatcat_tools/importers/datacite.py | 6 +++--- python/tests/files/datacite/datacite_result_16.json | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 0481337a..6c050565 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -729,7 +729,7 @@ class DataciteImporter(EntityImporter): # figshare if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): # set version if DOI ends with versioned suffix - doi_suffix = re.ext_ids.doi.split('.') + doi_suffix = re.ext_ids.doi.split('.')[-1] if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): re.version = doi_suffix # "Figure 123 from " -> component @@ -741,8 +741,8 @@ class DataciteImporter(EntityImporter): re.release_type = "component" # figshare.com - if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None: - re.container_name = "figshare.com" + if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: + re.extra['container_name'] = "figshare.com" return re diff --git a/python/tests/files/datacite/datacite_result_16.json 
b/python/tests/files/datacite/datacite_result_16.json index ef26c47c..d0b933d1 100644 --- a/python/tests/files/datacite/datacite_result_16.json +++ b/python/tests/files/datacite/datacite_result_16.json @@ -13,6 +13,7 @@ "doi": "10.6084/m9.figshare.1282478" }, "extra": { + "container_name": "figshare.com", "datacite": { "license": [ { -- cgit v1.2.3 From 2a492914082444690f853a55ab1394fc0cf50108 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 14:52:47 -0700 Subject: entity update: skip ingest of figshare+zenodo 'group' DOIs --- python/fatcat_tools/workers/changelog.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 1ac7a865..dc5ef299 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -175,6 +175,7 @@ class EntityUpdatesWorker(FatcatWorker): 'paper-conference', 'patent', 'peer_review', + 'post', 'report', 'retraction', 'review', @@ -217,6 +218,20 @@ class EntityUpdatesWorker(FatcatWorker): if doi.startswith(prefix): return False + # figshare + if doi and doi.startswith('10.6084/') or doi.startswith('10.25384/'): + # don't crawl "most recent version" (aka "group") DOIs + if not release.version: + return False + + # zenodo + if doi and doi.startswith('10.5281/'): + # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs) + if release.extra and release.extra.get('relations'): + for rel in release.extra['relations']: + if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')): + return False + return True def run(self): -- cgit v1.2.3 From 5eddc9b9aefbd7ae197d441b8a7af1fded940e2d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 15:23:25 -0700 Subject: entity update: default to ingest non-OA works --- python/fatcat_tools/workers/changelog.py | 19 
++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index dc5ef299..f7df6748 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker): self.ingest_file_request_topic = ingest_file_request_topic self.poll_interval = poll_interval self.consumer_group = "entity-updates" - self.ingest_oa_only = True + self.ingest_oa_only = False self.ingest_pdf_doi_prefix_blocklist = [ # gbif.org: many DOIs, not PDF fulltext "10.15468/", @@ -191,15 +191,16 @@ class EntityUpdatesWorker(FatcatWorker): 'stub', ) - # accept list sets a default "crawl it" despite OA metadata for - # known-OA DOI prefixes - in_acceptlist = False - if doi: - for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: - if doi.startswith(prefix): - in_acceptlist = True - if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): + + # accept list sets a default "crawl it" despite OA metadata for + # known-OA DOI prefixes + in_acceptlist = False + if doi: + for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: + if doi.startswith(prefix): + in_acceptlist = True + es = release_to_elasticsearch(release) # most datacite documents are in IRs and should be crawled is_datacite_doc = False -- cgit v1.2.3 From 03d2004717d36962aef1bd373d59ce799d7db9ab Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 15:45:36 -0700 Subject: entity update: change big5 ingest behavior In addition to changing the OA default, this was the main intended behavior change in this group of commits: want to ingest fewer attempts that we *expect* to fail, but default to ingest/crawl attempt if we are uncertain. This is because there is a long tail of journals that register DOIs and are de facto OA (fulltext is available), but we don't have metadata indicating them as such.
--- python/fatcat_tools/workers/changelog.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index f7df6748..65a8fcd8 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -159,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker): link_source = ingest_request.get('ingest_request') ingest_type = ingest_request.get('ingest_type') doi = ingest_request.get('ext_ids', {}).get('doi') + es = release_to_elasticsearch(release) is_document = release.release_type in ( 'article', @@ -191,17 +192,16 @@ class EntityUpdatesWorker(FatcatWorker): 'stub', ) - if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): + # accept list sets a default "crawl it" despite OA metadata for + # known-OA DOI prefixes + in_acceptlist = False + if doi: + for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: + if doi.startswith(prefix): + in_acceptlist = True - # accept list sets a default "crawl it" despite OA metadata for - # known-OA DOI prefixes - in_acceptlist = False - if doi: - for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: - if doi.startswith(prefix): - in_acceptlist = True + if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): - es = release_to_elasticsearch(release) # most datacite documents are in IRs and should be crawled is_datacite_doc = False if release.extra and ('datacite' in release.extra) and is_document: @@ -209,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker): if not (es['is_oa'] or in_acceptlist or is_datacite_doc): return False + # big publishers *generally* have accurate OA metadata, use + # preservation networks, and block our crawlers. 
So unless OA, or + # explicitly on accept list, or not preserved, skip crawling + if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist): + return False + # if ingest_type is pdf but release_type is almost certainly not a PDF, # skip it. This is mostly a datacite thing. if ingest_type == "pdf" and is_not_pdf: -- cgit v1.2.3