commit    b2f53077bd05a536b5fdb551755a559b653421d3
tree      1de40b893f0798377c0af92a4dada3278e4d1f2d
parent    d312ddfa0340b702ac858b08f8f91f785048af0b
author    Bryan Newbold <bnewbold@robocracy.org>  2020-12-16 23:49:09 -0800
committer Bryan Newbold <bnewbold@robocracy.org>  2020-12-17 23:03:08 -0800
dblp release importer: container_id lookup TSV, and dump JSON mode
 python/fatcat_import.py                       | 10
 python/fatcat_tools/importers/dblp_release.py | 76
 2 files changed, 73 insertions(+), 13 deletions(-)
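The new --dblp-container-map-file option takes a TSV file mapping dblp key prefixes to 26-character fatcat container identifiers, with a "dblp_prefix" header row; it is parsed by read_dblp_container_map_file() in the diff below. A minimal illustrative sample of that file (the container identifiers here are placeholders, not real fatcat entities):

    dblp_prefix     container_id
    journals/cacm   aaaaaaaaaaaaaaaaaaaaaaaaaa
    conf/sigmod     bbbbbbbbbbbbbbbbbbbbbbbbbb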
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 5ee81b92..90b53e5c 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -275,9 +275,10 @@ def run_doaj_article(args):
 
 def run_dblp_release(args):
     dwi = DblpReleaseImporter(args.api,
-        args.issn_map_file,
+        dblp_container_map_file=args.dblp_container_map_file,
         edit_batch_size=args.batch_size,
         do_updates=args.do_updates,
+        dump_json_mode=args.dump_json_mode,
     )
     Bs4XmlLargeFilePusher(
         dwi,
@@ -660,12 +661,15 @@ def main():
     sub_dblp_release.add_argument('xml_file',
         help="File with DBLP XML to import from",
         default=sys.stdin, type=argparse.FileType('rb'))
-    sub_dblp_release.add_argument('--issn-map-file',
-        help="ISSN to ISSN-L mapping file",
+    sub_dblp_release.add_argument('--dblp-container-map-file',
+        help="file path to dblp prefix to container_id TSV file",
         default=None, type=argparse.FileType('r'))
     sub_dblp_release.add_argument('--do-updates',
         action='store_true',
         help="update any pre-existing release entities")
+    sub_dblp_release.add_argument('--dump-json-mode',
+        action='store_true',
+        help="print release entities to stdout instead of importing")
     sub_dblp_release.set_defaults(
         func=run_dblp_release,
         auth_var="FATCAT_AUTH_WORKER_DBLP",
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 7e19855f..e4d20370 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -8,22 +8,25 @@ and passed to `parse_record()`.
 """
 
 import sys  # noqa: F401
+import json
 import warnings
 import datetime
 from typing import List, Optional, Any
 
 import fatcat_openapi_client
+
 from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
     clean_orcid, clean_arxiv_id, clean_wikidata_qid, clean_isbn13)
 from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.transforms import entity_to_dict
 
 
 class DblpReleaseImporter(EntityImporter):
 
     def __init__(self,
                  api,
-                 issn_map_file,
+                 dblp_container_map_file=None,
                  **kwargs):
 
         eg_desc = kwargs.get(
@@ -36,13 +39,13 @@ class DblpReleaseImporter(EntityImporter):
         # ensure default is to not do updates with this worker (override super() default)
         kwargs['do_updates'] = kwargs.get("do_updates", False)
         super().__init__(api,
-            issn_map_file=issn_map_file,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
             **kwargs)
 
+        self.dump_json_mode = kwargs.get("dump_json_mode", False)
         self.this_year = datetime.datetime.now().year
-        self.read_issn_map_file(issn_map_file)
+        self.read_dblp_container_map_file(dblp_container_map_file)
 
     ELEMENT_TYPES = [
         "article",
@@ -55,6 +58,25 @@ class DblpReleaseImporter(EntityImporter):
         #"data",  # no instances in 2020-11 dump
     ]
 
+    def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
+        self._dblp_container_map = dict()
+        if not dblp_container_map_file:
+            print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+            return
+        print("Loading dblp prefix container map file...", file=sys.stderr)
+        for line in dblp_container_map_file:
+            if line.startswith("dblp_prefix") or len(line) == 0:
+                continue
+            (prefix, container_id) = line.split()[0:2]
+            assert len(container_id) == 26
+            self._dblp_container_map[prefix] = container_id
+        print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+
+    def lookup_dblp_prefix(self, prefix):
+        if not prefix:
+            return None
+        return self._dblp_container_map.get(prefix)
+
     def want(self, xml_elem):
         if not xml_elem.name in self.ELEMENT_TYPES:
             self.counts['skip-type'] += 1
@@ -175,12 +197,8 @@ class DblpReleaseImporter(EntityImporter):
 
         container_id = None
         if dblp_prefix:
-            # XXX: container lookup by dblp_prefix, from local something
-            pass
-            #container_id = self.lookup_dblp_prefix(dblp_prefix)
-            #if not container_id:
-            #    self.counts['skip-dblp-prefix-lookup'] += 1
-            #    return False
+            container_id = self.lookup_dblp_prefix(dblp_prefix)
+            # note: we will skip later if couldn't find prefix
 
         publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
         volume = clean_str(xml_elem.volume and xml_elem.volume.text)
@@ -233,7 +251,7 @@ class DblpReleaseImporter(EntityImporter):
             dblp_extra['booktitle'] = booktitle
 
         if release_year and release_month:
-            # TODO: schema migration
+            # TODO: release_month schema migration
             extra['release_month'] = release_month
 
         if dblp_extra:
@@ -259,6 +277,17 @@ class DblpReleaseImporter(EntityImporter):
             extra=extra,
         )
         re = self.biblio_hacks(re)
+
+        if self.dump_json_mode:
+            re_dict = entity_to_dict(re, api_client=self.api.api_client)
+            re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
+            re_dict['_dblp_prefix'] = dblp_prefix
+            print(json.dumps(re_dict, sort_keys=True))
+            return False
+
+        if not re.container_id:
+            self.counts["skip-dblp-container-missing"] += 1
+            return False
         return re
 
     @staticmethod
@@ -442,3 +471,30 @@ class DblpReleaseImporter(EntityImporter):
             wikidata_qid=wikidata_qid,
             arxiv=arxiv_id,
         )
+
+    def dblp_ext_urls(self, xml_elem: Any) -> List[str]:
+        """
+        Takes a full XML object and returns a list of possible-fulltext URLs.
+
+        Used only in JSON dump mode, with the intent of transforming into
+        sandcrawler ingest requests.
+        """
+        EXTID_PATTERNS = [
+            '://doi.acm.org/',
+            '://doi.ieeecomputersociety.org/',
+            'doi.org/10.',
+            'wikidata.org/entity/Q',
+            '://arxiv.org/abs/',
+        ]
+        urls = []
+        for ee in xml_elem.find_all('ee'):
+            url = ee.text
+            skip = False
+            for pattern in EXTID_PATTERNS:
+                if pattern in url:
+                    skip = True
+                    break
+            if skip:
+                break
+            urls.append(url)
+        return urls
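For reference, a minimal stand-alone sketch of the TSV-driven prefix lookup that read_dblp_container_map_file() and lookup_dblp_prefix() implement above; the file name and prefix used here are hypothetical examples, not part of this commit:

    from typing import Dict, Optional

    def load_dblp_container_map(path: str) -> Dict[str, str]:
        """Parse a whitespace-separated 'dblp_prefix container_id' file into a dict."""
        mapping: Dict[str, str] = {}
        with open(path, "r") as f:
            for line in f:
                # skip the header row and blank lines, matching the importer's behavior
                if line.startswith("dblp_prefix") or not line.strip():
                    continue
                prefix, container_id = line.split()[0:2]
                # fatcat entity identifiers are 26-character base32 strings
                assert len(container_id) == 26
                mapping[prefix] = container_id
        return mapping

    def lookup_dblp_prefix(mapping: Dict[str, str], prefix: Optional[str]) -> Optional[str]:
        # a dblp record key may yield no usable prefix, in which case no container is assigned
        if not prefix:
            return None
        return mapping.get(prefix)

    if __name__ == "__main__":
        # hypothetical file name and prefix, for illustration only
        container_map = load_dblp_container_map("dblp_container_map.tsv")
        print(lookup_dblp_prefix(container_map, "journals/cacm"))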