crossref importer checks for existing DOIs

author: Bryan Newbold <bnewbold@robocracy.org> 2018-11-21 11:58:46 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-11-21 11:58:46 -0800
commit: 7ec413416acb2b3d7da0be32b78982316b9c696f (patch)
tree: cc0799316a0875d7aea6f1d9fddc03fb5e505410
parent: 008366697aba8046fd33ae1f3707972d87c9a342 (diff)
download: fatcat-7ec413416acb2b3d7da0be32b78982316b9c696f.tar.gz fatcat-7ec413416acb2b3d7da0be32b78982316b9c696f.zip
5 files changed, 42 insertions, 9 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 9ee24f8e..cc9a94e1 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -57,7 +57,7 @@ Usually 24 hours or so on fast production machine.
 Unknown speed!
 
     # No file update for the first import...
-    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update -
+    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates -
 
     # ... but do on the second
     zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aad4ee57..fe5b24a6 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -8,7 +8,8 @@ from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
 
 def run_crossref(args):
     fci = CrossrefImporter(args.host_url, args.issn_map_file,
-        args.extid_map_file, create_containers=(not args.no_create_containers))
+        args.extid_map_file, create_containers=(not args.no_create_containers),
+        check_existing=(not args.no_release_updates))
     if args.kafka_mode:
         consumer = make_kafka_consumer(
             args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import")
@@ -29,7 +30,7 @@ def run_issn(args):
 
 def run_matched(args):
     fmi = MatchedImporter(args.host_url,
-        skip_file_update=args.no_file_update)
+        skip_file_updates=args.no_file_updates)
     fmi.process_batch(args.json_file, size=args.batch_size)
     fmi.describe_run()
 
@@ -74,6 +75,9 @@ def main():
     sub_crossref.add_argument('--kafka-mode',
         action='store_true',
         help="consume from kafka topic (not stdin)")
+    sub_crossref.add_argument('--no-release-updates',
+        action='store_true',
+        help="don't lookup existing DOIs, just insert (only for bootstrap)")
 
     sub_orcid = subparsers.add_parser('orcid')
     sub_orcid.set_defaults(func=run_orcid)
@@ -98,7 +102,7 @@ def main():
     sub_matched.add_argument('json_file',
         help="JSON file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_matched.add_argument('--no-file-update',
+    sub_matched.add_argument('--no-file-updates',
         action='store_true',
         help="don't lookup existing files, just insert (only for bootstrap)")
     sub_matched.add_argument('--batch-size',
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 475afdb0..385a8235 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -40,7 +40,7 @@ class CrossrefImporter(FatcatImporter):
     See https://github.com/CrossRef/rest-api-doc for JSON schema notes
     """
 
-    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
+    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True):
         super().__init__(host_url, issn_map_file)
         self.extid_map_db = None
         if extid_map_file:
@@ -50,6 +50,7 @@ class CrossrefImporter(FatcatImporter):
         else:
             print("Not using external ID map")
         self.create_containers = create_containers
+        self.check_existing = check_existing
 
     def lookup_ext_ids(self, doi):
         if self.extid_map_db is None:
@@ -85,6 +86,20 @@ class CrossrefImporter(FatcatImporter):
                 'book-track', 'proceedings-series'):
             return None
 
+        # lookup existing DOI
+        existing_release = None
+        if self.check_existing:
+            try:
+                existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing_release:
+            return None
+
         # contribs
         def do_contribs(obj_list, ctype):
             contribs = []
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 732fccbe..6f83dd23 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -37,12 +37,12 @@ class MatchedImporter(FatcatImporter):
     - core_id, wikidata_id, pmcid, pmid: not as lists
     """
 
-    def __init__(self, host_url, skip_file_update=False, default_mime=None,
+    def __init__(self, host_url, skip_file_updates=False, default_mime=None,
             default_link_rel="web"):
         super().__init__(host_url)
         self.default_mime = default_mime
         self.default_link_rel = default_link_rel
-        self.skip_file_update = skip_file_update
+        self.skip_file_updates = skip_file_updates
 
     def make_url(self, raw):
         rel = self.default_link_rel
@@ -61,7 +61,7 @@ class MatchedImporter(FatcatImporter):
 
         # lookup sha1, or create new entity
         fe = None
-        if not self.skip_file_update:
+        if not self.skip_file_updates:
             try:
                 fe = self.api.lookup_file(sha1=sha1)
             except fatcat_client.rest.ApiException as err:
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index c129e729..1fb4a70f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -7,7 +7,12 @@ from fatcat_tools.importers import CrossrefImporter
 @pytest.fixture(scope="function")
 def crossref_importer():
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
+        yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=False)
+
+@pytest.fixture(scope="function")
+def crossref_importer_existing():
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=True)
 
 def test_crossref_importer_batch(crossref_importer):
     with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
@@ -61,3 +66,12 @@ def test_crossref_dict_parse(crossref_importer):
         assert r.refs[0].container_name == "J. Chem. Phys."
         assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"}
         assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry"
+
+def test_stateful_checking(crossref_importer_existing):
+    with open('tests/files/crossref-works.single.json', 'r') as f:
+        # not a single line, a whole document
+        raw = json.loads(f.read())
+        # the release might not exist yet, so import it first to guarantee the DOI is present
+        crossref_importer_existing.process_source([json.dumps(raw)])
+        # ok, make sure we get 'None' back
+        assert crossref_importer_existing.parse_crossref_dict(raw) is None
author	Bryan Newbold <bnewbold@robocracy.org>	2018-11-21 11:58:46 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-11-21 11:58:46 -0800
commit	7ec413416acb2b3d7da0be32b78982316b9c696f (patch)
tree	cc0799316a0875d7aea6f1d9fddc03fb5e505410
parent	008366697aba8046fd33ae1f3707972d87c9a342 (diff)
download	fatcat-7ec413416acb2b3d7da0be32b78982316b9c696f.tar.gz fatcat-7ec413416acb2b3d7da0be32b78982316b9c696f.zip