From e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Mon, 10 Aug 2020 15:07:19 -0700
Subject: update crawl blocklist for SPNv2 requests which mostly fail

---
 python/fatcat_tools/workers/changelog.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d5891ad1..1ac7a865 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker):
             "10.3932/",
             # ccdc.cam.ac.uk: crystal structures
             "10.5517/",
+            # researchgate: mostly blocks our crawler
+            "10.13140/",
+            # springerlink: mostly blocks crawler
+            "10.1007/",
+            # nature group: mostly blocks crawler
+            "10.1038/",
+            # SAGE: mostly blocks crawler
+            "10.1177/",
+            # IOP: mostly blocks crawler
+            "10.1088/",
         ]
         self.live_pdf_ingest_doi_prefix_acceptlist = [
             # biorxiv and medrxiv
             "10.1101/",
-            # researchgate
-            "10.13140/",
             # the lancet (often hybrid OA)
             "10.1016/s0140-6736",
             "10.1016/s2213-2600",
-- 
cgit v1.2.3


From 26646b5636767495881965d566e3889ad6d126e7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Mon, 10 Aug 2020 17:33:15 -0700
Subject: datacite import: refactor publisher-specific hacks into static method

Also tweak title/publisher detection to use DOI prefixes
---
 python/fatcat_tools/importers/datacite.py | 44 ++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index f93362d6..00ce9ccd 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -493,21 +493,6 @@ class DataciteImporter(EntityImporter):
         if release_type is None:
             print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
 
-        # release_type exception: Global Biodiversity Information Facility
-        # publishes highly interesting datasets, but titles are mostly the same
-        # ("GBIF Occurrence Download" or "Occurrence Download"); set
-        # release_type to "stub" (CSL/FC).
-        if publisher == 'The Global Biodiversity Information Facility':
-            release_type = 'stub'
-
-        # release_type exception: lots of "Experimental Crystal Structure Determination"
-        if publisher == 'Cambridge Crystallographic Data Centre':
-            release_type = 'entry'
-
-        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
-        if title.lower().startswith('additional file'):
-            release_type = 'stub'
-
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
         # like langcodes solves a pretty boring problem. At one level, that's
@@ -693,6 +678,35 @@ class DataciteImporter(EntityImporter):
             license_slug=license_slug,
             version=version,
         )
+        re = self.biblio_hacks(re)
+        return re
+
+    @staticmethod
+    def biblio_hacks(re):
+        """
+        This function handles known special cases. For example,
+        publisher-specific or platform-specific workarounds.
+        """
+
+        # only runs on datacite entities with a DOI
+        assert re.ext_ids.doi
+
+        # release_type exception: Global Biodiversity Information Facility
+        # publishes highly interesting datasets, but titles are mostly the same
+        # ("GBIF Occurrence Download" or "Occurrence Download"); set
+        # release_type to "stub" (CSL/FC).
+        if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'):
+            re.release_type = 'stub'
+
+        # release_type exception: lots of "Experimental Crystal Structure Determination"
+        # publisher: "Cambridge Crystallographic Data Centre"
+        if re.ext_ids.doi.startswith('10.5517/'):
+            re.release_type = 'entry'
+
+        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
+        if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
+            re.release_type = 'component'
+
         return re
 
     def try_update(self, re):
-- 
cgit v1.2.3


From 211ef075f5ac2960fa09134043a8246270d99baf Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Mon, 10 Aug 2020 17:34:50 -0700
Subject: datacite import: refactor release_type detection into static method

---
 python/fatcat_tools/importers/datacite.py | 65 ++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 14 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 00ce9ccd..0481337a 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -478,20 +478,7 @@ class DataciteImporter(EntityImporter):
                 license_slug = slug
             license_extra.append(lic)
 
-        # Release type. Try to determine the release type from a variety of
-        # types supplied in datacite. The "attributes.types.resourceType" is
-        # uncontrolled (170000+ unique values, from "null", "Dataset" to
-        # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
-        # flows in 2009") citeproc may be the closest, but not always supplied.
-        # Order lookup roughly by completeness of mapping.
-        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
-            value = attributes.get('types', {}).get(typeType)
-            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
-            if release_type is not None:
-                break
-
-        if release_type is None:
-            print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
+        release_type = self.datacite_release_type(doi, attributes)
 
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -681,6 +668,38 @@ class DataciteImporter(EntityImporter):
         re = self.biblio_hacks(re)
         return re
 
+    @staticmethod
+    def datacite_release_type(doi, attributes):
+        """
+        Determine the release type from the type fields supplied by datacite.
+        The "attributes.types.resourceType" field is uncontrolled (170000+
+        unique values, from "null", "Dataset" to "Jupyter Notebook" and
+        "Macroseismic Data Points" or "2 days of IP flows in 2009"); citeproc
+        may be the closest, but is not always supplied. Lookup order is
+        roughly by completeness of mapping. Returns None if nothing maps.
+        """
+
+        release_type = None
+        if not attributes.get('types'):
+            return None
+        types = attributes['types']
+
+        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+            value = types.get(typeType)
+            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
+            if release_type is not None:
+                break
+
+        # special case: figshare "collections" which group other entities
+        if doi.startswith('10.6084/') or doi.startswith('10.25384'):
+            if types.get('resourceType') == "Collection":
+                release_type = "stub"
+
+        if release_type is None:
+            print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr)
+
+        return release_type
+
     @staticmethod
     def biblio_hacks(re):
         """
@@ -707,6 +726,24 @@ class DataciteImporter(EntityImporter):
         if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
             re.release_type = 'component'
 
+        # figshare
+        if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
+            # set version if DOI ends with versioned suffix
+            doi_suffix = re.ext_ids.doi.split('.')
+            if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+                re.version = doi_suffix
+            # "Figure 123 from " -> component
+            # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
+            if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+                if re.title.startswith("Figure "):
+                    re.release_type = "component"
+                elif re.title.startswith("Table "):
+                    re.release_type = "component"
+
+        # figshare.com
+        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None:
+            re.container_name = "figshare.com"
+
         return re
 
     def try_update(self, re):
-- 
cgit v1.2.3


From ff05a03a3874e17557174d3534a1c2d11e01c4a6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Mon, 10 Aug 2020 17:35:16 -0700
Subject: datacite import: figshare-specific hacks

---
 python/fatcat_tools/importers/datacite.py           | 6 +++---
 python/tests/files/datacite/datacite_result_16.json | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 0481337a..6c050565 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -729,7 +729,7 @@ class DataciteImporter(EntityImporter):
         # figshare
         if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
             # set version if DOI ends with versioned suffix
-            doi_suffix = re.ext_ids.doi.split('.')
+            doi_suffix = re.ext_ids.doi.split('.')[-1]
             if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
                 re.version = doi_suffix
             # "Figure 123 from " -> component
@@ -741,8 +741,8 @@ class DataciteImporter(EntityImporter):
                     re.release_type = "component"
 
         # figshare.com
-        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None:
-            re.container_name = "figshare.com"
+        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None:
+            re.extra['container_name'] = "figshare.com"
 
         return re
 
diff --git a/python/tests/files/datacite/datacite_result_16.json b/python/tests/files/datacite/datacite_result_16.json
index ef26c47c..d0b933d1 100644
--- a/python/tests/files/datacite/datacite_result_16.json
+++ b/python/tests/files/datacite/datacite_result_16.json
@@ -13,6 +13,7 @@
     "doi": "10.6084/m9.figshare.1282478"
   },
   "extra": {
+    "container_name": "figshare.com",
     "datacite": {
       "license": [
         {
-- 
cgit v1.2.3


From 2a492914082444690f853a55ab1394fc0cf50108 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 11 Aug 2020 14:52:47 -0700
Subject: entity update: skip ingest of figshare+zenodo 'group' DOIs

---
 python/fatcat_tools/workers/changelog.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 1ac7a865..dc5ef299 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -175,6 +175,7 @@ class EntityUpdatesWorker(FatcatWorker):
             'paper-conference',
             'patent',
             'peer_review',
+            'post',
             'report',
             'retraction',
             'review',
@@ -217,6 +218,20 @@ class EntityUpdatesWorker(FatcatWorker):
                 if doi.startswith(prefix):
                     return False
 
+        # figshare (doi may be None; parenthesize since 'and' binds tighter than 'or')
+        if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')):
+            # don't crawl "most recent version" (aka "group") DOIs
+            if not release.version:
+                return False
+
+        # zenodo
+        if doi and doi.startswith('10.5281/'):
+            # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
+            if release.extra and release.extra.get('relations'):
+                for rel in release.extra['relations']:
+                    if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')):
+                        return False
+
         return True
 
     def run(self):
-- 
cgit v1.2.3


From 5eddc9b9aefbd7ae197d441b8a7af1fded940e2d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 11 Aug 2020 15:23:25 -0700
Subject: entity update: default to ingest non-OA works

---
 python/fatcat_tools/workers/changelog.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index dc5ef299..f7df6748 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker):
         self.ingest_file_request_topic = ingest_file_request_topic
         self.poll_interval = poll_interval
         self.consumer_group = "entity-updates"
-        self.ingest_oa_only = True
+        self.ingest_oa_only = False
         self.ingest_pdf_doi_prefix_blocklist = [
             # gbif.org: many DOIs, not PDF fulltext
             "10.15468/",
@@ -191,15 +191,16 @@ class EntityUpdatesWorker(FatcatWorker):
             'stub',
         )
 
-        # accept list sets a default "crawl it" despite OA metadata for
-        # known-OA DOI prefixes
-        in_acceptlist = False
-        if doi:
-            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
-                if doi.startswith(prefix):
-                    in_acceptlist = True
-
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+
+            # accept list sets a default "crawl it" despite OA metadata for
+            # known-OA DOI prefixes
+            in_acceptlist = False
+            if doi:
+                for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+                    if doi.startswith(prefix):
+                        in_acceptlist = True
+
             es = release_to_elasticsearch(release)
             # most datacite documents are in IRs and should be crawled
             is_datacite_doc = False
-- 
cgit v1.2.3


From 03d2004717d36962aef1bd373d59ce799d7db9ab Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 11 Aug 2020 15:45:36 -0700
Subject: entity update: change big5 ingest behavior

In addition to changing the OA default, this was the main intended
behavior change in this group of commits: we want to make fewer ingest
attempts that we *expect* to fail, but default to an ingest/crawl attempt
if we are uncertain. This is because there is a long tail of journals
that register DOIs and are de facto OA (fulltext is available), but we
don't have metadata indicating them as such.
---
 python/fatcat_tools/workers/changelog.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index f7df6748..65a8fcd8 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -159,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker):
         link_source = ingest_request.get('ingest_request')
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
+        es = release_to_elasticsearch(release)
 
         is_document = release.release_type in (
             'article',
@@ -191,17 +192,16 @@ class EntityUpdatesWorker(FatcatWorker):
             'stub',
         )
 
-        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
+        in_acceptlist = False
+        if doi:
+            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+                if doi.startswith(prefix):
+                    in_acceptlist = True
 
-            # accept list sets a default "crawl it" despite OA metadata for
-            # known-OA DOI prefixes
-            in_acceptlist = False
-            if doi:
-                for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
-                    if doi.startswith(prefix):
-                        in_acceptlist = True
+        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
 
-            es = release_to_elasticsearch(release)
             # most datacite documents are in IRs and should be crawled
             is_datacite_doc = False
             if release.extra and ('datacite' in release.extra) and is_document:
@@ -209,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker):
             if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # big publishers *generally* have accurate OA metadata, use
+        # preservation networks, and block our crawlers. So unless OA, or
+        # explicitly on accept list, or not preserved, skip crawling
+        if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist):
+            return False
+
         # if ingest_type is pdf but release_type is almost certainly not a PDF,
         # skip it. This is mostly a datacite thing.
         if ingest_type == "pdf" and is_not_pdf:
-- 
cgit v1.2.3