about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author: Martin Czygan <martin@archive.org> 2020-08-13 18:38:29 +0000
committer: Martin Czygan <martin@archive.org> 2020-08-13 18:38:29 +0000
commit: b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a (patch)
tree: c758be171e8367bd9f4765f576c7c3c19cebdb6a /python/fatcat_tools
parent: 1f75aa4cd10947f725eb3db2a51377579a09eb01 (diff)
parent: 03d2004717d36962aef1bd373d59ce799d7db9ab (diff)
download: fatcat-b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a.tar.gz
download: fatcat-b2bb070a161b4e4b05ab51ab4ab7bae6d1290e4a.zip
Merge branch 'bnewbold-ingest-improvements' into 'master'
ingest behavior changes; some datacite metadata tweaks

See merge request webgroup/fatcat!78
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/datacite.py 109
-rw-r--r-- python/fatcat_tools/workers/changelog.py 38
2 files changed, 114 insertions, 33 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f93362d6..6c050565 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -478,35 +478,7 @@ class DataciteImporter(EntityImporter):
license_slug = slug
license_extra.append(lic)
- # Release type. Try to determine the release type from a variety of
- # types supplied in datacite. The "attributes.types.resourceType" is
- # uncontrolled (170000+ unique values, from "null", "Dataset" to
- # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
- # flows in 2009") citeproc may be the closest, but not always supplied.
- # Order lookup roughly by completeness of mapping.
- for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
- value = attributes.get('types', {}).get(typeType)
- release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
- if release_type is not None:
- break
-
- if release_type is None:
- print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
-
- # release_type exception: Global Biodiversity Information Facility
- # publishes highly interesting datasets, but titles are mostly the same
- # ("GBIF Occurrence Download" or "Occurrence Download"); set
- # release_type to "stub" (CSL/FC).
- if publisher == 'The Global Biodiversity Information Facility':
- release_type = 'stub'
-
- # release_type exception: lots of "Experimental Crystal Structure Determination"
- if publisher == 'Cambridge Crystallographic Data Centre':
- release_type = 'entry'
-
- # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
- if title.lower().startswith('additional file'):
- release_type = 'stub'
+ release_type = self.datacite_release_type(doi, attributes)
# Language values are varied ("ger", "es", "English", "ENG", "en-us",
# "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -693,6 +665,85 @@ class DataciteImporter(EntityImporter):
license_slug=license_slug,
version=version,
)
+ re = self.biblio_hacks(re)
+ return re
+
+ @staticmethod
+ def datacite_release_type(doi, attributes):
+ """
+ Release type. Try to determine the release type from a variety of types
+ supplied in datacite. The "attributes.types.resourceType" is
+ uncontrolled (170000+ unique values, from "null", "Dataset" to "Jupyter
+ Notebook" and "Macroseismic Data Points" or "2 days of IP flows in
+ 2009") citeproc may be the closest, but not always supplied. Order
+ lookup roughly by completeness of mapping.
+ """
+
+ release_type = None
+ if not attributes.get('types'):
+ return None
+ types = attributes['types']
+
+ for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+ value = types.get(typeType)
+ release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
+ if release_type is not None:
+ break
+
+ # special case: figshare "collections" which group other entities
+ if doi.startswith('10.6084/') or doi.startswith('10.25384'):
+ if types.get('resourceType') == "Collection":
+ release_type = "stub"
+
+ if release_type is None:
+ print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr)
+
+ return release_type
+
+ @staticmethod
+ def biblio_hacks(re):
+ """
+ This function handles known special cases. For example,
+ publisher-specific or platform-specific workarounds.
+ """
+
+ # only runs on datacite entities with a DOI
+ assert re.ext_ids.doi
+
+ # release_type exception: Global Biodiversity Information Facility
+ # publishes highly interesting datasets, but titles are mostly the same
+ # ("GBIF Occurrence Download" or "Occurrence Download"); set
+ # release_type to "stub" (CSL/FC).
+ if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'):
+ re.release_type = 'stub'
+
+ # release_type exception: lots of "Experimental Crystal Structure Determination"
+ # publisher: "Cambridge Crystallographic Data Centre"
+ if re.ext_ids.doi.startswith('10.5517/'):
+ re.release_type = 'entry'
+
+ # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
+ if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
+ re.release_type = 'component'
+
+ # figshare
+ if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
+ # set version if DOI ends with versioned suffix
+ doi_suffix = re.ext_ids.doi.split('.')[-1]
+ if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+ re.version = doi_suffix
+ # "Figure 123 from " -> component
+ # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
+ if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+ if re.title.startswith("Figure "):
+ re.release_type = "component"
+ elif re.title.startswith("Table "):
+ re.release_type = "component"
+
+ # figshare.com
+ if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None:
+ re.extra['container_name'] = "figshare.com"
+
return re
def try_update(self, re):
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d5891ad1..65a8fcd8 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker):
self.ingest_file_request_topic = ingest_file_request_topic
self.poll_interval = poll_interval
self.consumer_group = "entity-updates"
- self.ingest_oa_only = True
+ self.ingest_oa_only = False
self.ingest_pdf_doi_prefix_blocklist = [
# gbif.org: many DOIs, not PDF fulltext
"10.15468/",
@@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker):
"10.3932/",
# ccdc.cam.ac.uk: crystal structures
"10.5517/",
+ # researchgate: mostly blocks our crawler
+ "10.13140/",
+ # springerlink: mostly blocks crawler
+ "10.1007/",
+ # nature group: mostly blocks crawler
+ "10.1038/",
+ # SAGE: mostly blocks crawler
+ "10.1177/",
+ # IOP: mostly blocks crawler
+ "10.1088/",
]
self.live_pdf_ingest_doi_prefix_acceptlist = [
# biorxiv and medrxiv
"10.1101/",
- # researchgate
- "10.13140/",
# the lancet (often hybrid OA)
"10.1016/s0140-6736",
"10.1016/s2213-2600",
@@ -151,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker):
link_source = ingest_request.get('ingest_request')
ingest_type = ingest_request.get('ingest_type')
doi = ingest_request.get('ext_ids', {}).get('doi')
+ es = release_to_elasticsearch(release)
is_document = release.release_type in (
'article',
@@ -167,6 +176,7 @@ class EntityUpdatesWorker(FatcatWorker):
'paper-conference',
'patent',
'peer_review',
+ 'post',
'report',
'retraction',
'review',
@@ -191,7 +201,7 @@ class EntityUpdatesWorker(FatcatWorker):
in_acceptlist = True
if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
- es = release_to_elasticsearch(release)
+
# most datacite documents are in IRs and should be crawled
is_datacite_doc = False
if release.extra and ('datacite' in release.extra) and is_document:
@@ -199,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker):
if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
return False
+ # big publishers *generally* have accurate OA metadata, use
+ # preservation networks, and block our crawlers. So unless OA, or
+ # explicitly on accept list, or not preserved, skip crawling
+ if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist):
+ return False
+
# if ingest_type is pdf but release_type is almost certainly not a PDF,
# skip it. This is mostly a datacite thing.
if ingest_type == "pdf" and is_not_pdf:
@@ -209,6 +225,20 @@ class EntityUpdatesWorker(FatcatWorker):
if doi.startswith(prefix):
return False
+ # figshare
+ if doi and doi.startswith('10.6084/') or doi.startswith('10.25384/'):
+ # don't crawl "most recent version" (aka "group") DOIs
+ if not release.version:
+ return False
+
+ # zenodo
+ if doi and doi.startswith('10.5281/'):
+ # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
+ if release.extra and release.extra.get('relations'):
+ for rel in release.extra['relations']:
+ if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')):
+ return False
+
return True
def run(self):