From 82ea0b02b29d6be0542eb43f00710a23ed8516c2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 15 May 2019 15:06:35 -0700
Subject: initial JSTOR importer

---
 python/fatcat_tools/importers/__init__.py |   1 +
 python/fatcat_tools/importers/jstor.py    | 270 ++++++++++++++++++++++++++++++
 python/parse_jstor_xml.py                 | 178 --------------------
 3 files changed, 271 insertions(+), 178 deletions(-)
 create mode 100644 python/fatcat_tools/importers/jstor.py
 delete mode 100644 python/parse_jstor_xml.py

(limited to 'python')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index ecbfe38e..497946ea 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,6 +15,7 @@ To run an import you combine two classes; one each of:
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
+from .jstor import JstorImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
new file mode 100644
index 00000000..6ab320c3
--- /dev/null
+++ b/python/fatcat_tools/importers/jstor.py
@@ -0,0 +1,270 @@
+
+
+import sys
+import json
+import sqlite3
+import datetime
+import itertools
+import subprocess
+from bs4 import BeautifulSoup
+
+import fatcat_client
+from .common import EntityImporter, clean
+
+# is this just ISO 3-char to ISO 2-char?
+# XXX: more entries
+JSTOR_LANG_MAP = {
+    'eng': 'en',
+}
+
+# XXX: more entries
+JSTOR_CONTRIB_MAP = {
+    'author': 'author',
+    'editor': 'editor',
+    'translator': 'translator',
+    'illustrator': 'illustrator',
+}
+
+class JstorImporter(EntityImporter):
+    """
+    Importer for JSTOR bulk XML metadata (eg, from their Early Journals
+    Collection)
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of JSTOR XML metadata")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.create_containers = kwargs.get('create_containers')
+
+        self.read_issn_map_file(issn_map_file)
+
+    def want(self, obj):
+        return True
+
+    def parse_record(self, article):
+
+        journal_meta = article.front.find("journal-meta")
+        article_meta = article.front.find("article-meta")
+
+        extra = dict()
+        extra_jstor = dict()
+
+        journal_title = journal_meta.find("journal-title").string
+        publisher = journal_meta.find("publisher-name").string
+        issn = journal_meta.find("issn")
+        if issn:
+            issn = issn.string
+            if len(issn) == 8:
+                issn = "{}-{}".format(issn[0:4], issn[4:8])
+            else:
+                assert len(issn) == 9
+        # XXX:
+        container_id = None
+        container = dict(
+            name=journal_title,
+            publisher=publisher,
+            issn=issn, # TODO: ISSN-L lookup...
+ ) + + doi = article_meta.find("article-id", {"pub-id-type": "doi"}) + if doi: + doi = doi.string.lower().strip() + + jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) + if jstor_id: + jstor_id = jstor_id.string + + title = article_meta.find("article-title") + if title: + title = title.string.strip() + if title.endswith('.'): + title = title[:-1] + + contribs = [] + cgroup = article_meta.find("contrib-group") + if cgroup: + for c in cgroup.find_all("contrib"): + given = c.find("given-names") + surname = c.find("surname") + if given and surname: + name = "{} {}".format(given.string, surname.string) + elif surname: + name = surname.string + else: + name = None + contribs.append(fatcat_client.ReleaseContrib( + role=JSTOR_CONTRIB_MAP.get(c['contrib-type']), + raw_name=clean(name), + given_name=clean(given.string), + surname=clean(surname.string), + )) + + release_year = None + release_date = None + pub_date = article_meta.find('pub-date') + if pub_date and pub_date.year: + release_year = int(pub_date.year.string) + if pub_date.month and pub_date.day: + release_date = datetime.date( + release_year, + int(pub_date.month.string), + int(pub_date.day.string)) + + volume = None + if article_meta.volume: + volume = article_meta.volume.string or None + + issue = None + if article_meta.issue: + issue = article_meta.issue.string or None + + pages = None + if article_meta.find("page-range"): + pages = article_meta.find("page-range").string + elif article_meta.fpage: + pages = article_meta.fpage.string + + language = None + cm = article_meta.find("custom-meta") + if cm.find("meta-name").string == "lang": + language = cm.find("meta-value").string + language = JSTOR_LANG_MAP.get(language) + + release_type = "article-journal" + if "[Abstract]" in title: + # TODO: strip the "[Abstract]" bit? 
+            release_type = "abstract"
+        elif "[Editorial" in title:
+            release_type = "editorial"
+        elif "[Letter" in title:
+            release_type = "letter"
+        elif "[Poem" in title or "[Photograph" in title:
+            release_type = None
+
+        if title.startswith("[") and title.endswith("]"):
+            # strip brackets if that is all that is there (eg, translation or non-english)
+            title = title[1:-1]
+
+        # JSTOR issue-id
+        if article_meta.find('issue-id'):
+            issue_id = clean(article_meta.find('issue-id').string)
+            if issue_id:
+                extra_jstor['issue_id'] = issue_id
+
+        # JSTOR journal-id
+        # XXX:
+
+        # everything in JSTOR is published
+        release_stage = "published"
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   pubmed: retraction refs
+        if extra_jstor:
+            extra['jstor'] = extra_jstor
+        if not extra:
+            extra = None
+
+        re = fatcat_client.ReleaseEntity(
+            #work_id
+            title=title,
+            #original_title
+            release_type=release_type,
+            release_stage=release_stage,
+            release_date=release_date.isoformat() if release_date else None,
+            release_year=release_year,
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                jstor=jstor_id,
+            ),
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=language,
+            #license_slug
+
+            # content, mimetype, lang
+            #abstracts=abstracts,
+
+            contribs=contribs,
+
+            #   key, year, container_name, title, locator
+            #   extra: volume, authors, issue, publisher, identifiers
+            #refs=refs,
+
+            #   name, type, publisher, issnl
+            #   extra: issnp, issne, original_name, languages, country
+            container_id=container_id,
+
+            extra=extra,
+        )
+        return re
+
+    def try_update(self, re):
+
+        # first, lookup existing by JSTOR id (which must be defined)
+        existing = None
+        try:
+            existing = self.api.lookup_release(jstor=re.ext_ids.jstor)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        # then try DOI lookup if there is one
+        if not existing and re.ext_ids.doi:
+            try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+
+        if existing and existing.ext_ids.jstor:
+            # don't update if it already has JSTOR ID
+            self.counts['exists'] += 1
+            return False
+        elif existing:
+            # but do update if only DOI was set
+            existing.ext_ids.jstor = re.ext_ids.jstor
+            existing.extra['jstor'] = re.extra['jstor']
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_record on each
+        for article in soup.find_all("article"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = JstorImporter()
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_jstor_xml.py b/python/parse_jstor_xml.py
deleted file mode 100644
index 04f2b18e..00000000
--- a/python/parse_jstor_xml.py
+++ /dev/null
@@ -1,178 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-class JstorXmlParser():
-    """
-    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("article"):
-            resp = self.parse_article(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-    def parse_article(self, article):
-
-        journal_meta = article.front.find("journal-meta")
-        article_meta = article.front.find("article-meta")
-
-        extra = dict()
-        extra_jstor = dict()
-
-        journal_title = journal_meta.find("journal-title").string
-        publisher = journal_meta.find("publisher-name").string
-        issn = journal_meta.find("issn")
-        if issn:
-            issn = issn.string
-            if len(issn) == 8:
-                issn = "{}-{}".format(issn[0:4], issn[4:8])
-            else:
-                assert len(issn) == 9
-        container = dict(
-            name=journal_title,
-            publisher=publisher,
-            issn=issn, # TODO: ISSN-L lookup...
-        )
-
-        doi = article_meta.find("article-id", attr={"pub-id-type": "doi"})
-        if doi:
-            doi = doi.string.lower().strip()
-
-        title = article_meta.find("article-title")
-        if title:
-            title = title.string.strip()
-            if title.endswith('.'):
-                title = title[:-1]
-
-        contribs = []
-        cgroup = article_meta.find("contrib-group")
-        if cgroup:
-            for c in cgroup.find_all("contrib"):
-                given = c.find("given-names")
-                surname = c.find("surname")
-                if given and surname:
-                    name = "{} {}".format(given.string, surname.string)
-                elif surname:
-                    name = surname.string
-                else:
-                    name = None
-                contribs.append(dict(
-                    role=c['contrib-type'], # XXX: types? mapping?
-                    raw_name=name,
-                ))
-
-        release_year = None
-        release_date = None
-        pub_date = article_meta.find('pub-date')
-        if pub_date and pub_date.year:
-            release_year = int(pub_date.year.string)
-            if pub_date.month and pub_date.day:
-                release_date = datetime.date(
-                    release_year,
-                    int(pub_date.month.string),
-                    int(pub_date.day.string))
-
-        volume = None
-        if article_meta.volume:
-            volume = article_meta.volume.string or None
-
-        issue = None
-        if article_meta.issue:
-            issue = article_meta.issue.string or None
-
-        pages = None
-        if article_meta.find("page-range"):
-            pages = article_meta.find("page-range").string
-        elif article_meta.fpage:
-            pages = article_meta.fpage.string
-
-        language = None
-        cm = article_meta.find("custom-meta")
-        if cm.find("meta-name").string == "lang":
-            language = cm.find("meta-value").string
-
-        release_type = "article-journal"
-        if "[Abstract]" in title:
-            release_type = "abstract"
-        elif "[Editorial" in title:
-            release_type = "editorial"
-        elif "[Letter" in title:
-            release_type = "letter"
-        elif "[Poem" in title or "[Photograph" in title:
-            release_type = None
-
-        if title.startswith("[") and title.endswith("]"):
-            # strip brackets if that is all that is there (eg, translation or non-english)
-            title = title[1:-1]
-
-        # everything in JSTOR is published
-        release_status = "published"
-
-        if extra_jstor:
-            extra['jstor'] = extra_jstor
-        if not extra:
-            extra = None
-
-        re = dict(
-            issn=issn, # not an entity field
-            #work_id
-            title=title,
-            #original_title
-            release_type=release_type,
-            release_status=release_status,
-            release_date=release_date.isoformat(),
-            release_year=release_year,
-            doi=doi,
-            #pmid
-            #pmcid
-            #isbn13 # TODO: ?
-            volume=volume,
-            issue=issue,
-            pages=pages,
-            publisher=publisher,
-            language=language,
-            #license_slug # TODO: ?
-
-            # content, mimetype, lang
-            #abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
-            contribs=contribs,
-
-            # key, year, container_name, title, locator
-            # extra: volume, authors, issue, publisher, identifiers
-            #refs=refs,
-
-            # name, type, publisher, issnl
-            # extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
-            extra=extra,
-        )
-        return re
-
-if __name__=='__main__':
-    parser = JstorXmlParser()
-    parser.parse_file(open(sys.argv[1]))
--
cgit v1.2.3
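
Usage note (not part of the commit): the __main__ block copied over from the old script no longer matches the new JstorImporter(api, issn_map_file) constructor, and as the importers package docstring says, an import is run by combining an importer with a pusher. Below is a minimal driver sketch, illustrative only; it assumes the Bs4XmlFilePusher(importer, xml_file, record_tag) signature from common.py, the stock OpenAPI-generated fatcat_client configuration objects, and hypothetical API host and ISSN map paths.

# Illustrative sketch only: wire JstorImporter to a fatcat API client and push
# one JSTOR bulk XML file through it. Host URL, ISSN map path, and the
# Bs4XmlFilePusher(importer, xml_file, record_tag) signature are assumptions.
import sys

import fatcat_client
from fatcat_tools.importers import JstorImporter, Bs4XmlFilePusher


def main():
    # point the generated client at a (hypothetical) local fatcat API instance
    conf = fatcat_client.Configuration()
    conf.host = "http://localhost:9411/v0"
    api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))

    # ISSN to ISSN-L map file, as consumed by read_issn_map_file() (hypothetical path)
    with open("ISSN-to-ISSN-L.txt", "r") as issn_map_file:
        importer = JstorImporter(api, issn_map_file)

    # each <article> element in the bulk XML becomes one parse_record() call
    with open(sys.argv[1], "r") as xml_file:
        Bs4XmlFilePusher(importer, xml_file, "article").run()


if __name__ == "__main__":
    main()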