5 files changed, 436 insertions, 210 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index f5ff43e5..ecbfe38e 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,8 +12,9 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, KafkaJsonPusher, make_kafka_consumer, clean
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .jalc import JalcImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 282f775c..7fca38cf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,13 +7,16 @@ import ftfy
 import sqlite3
 import itertools
 import subprocess
+import unicodedata
 from collections import Counter
 import pykafka
+from bs4 import BeautifulSoup
 
 import fatcat_client
 from fatcat_client.rest import ApiException
 
 
+DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
@@ -52,6 +55,23 @@ def test_clean():
     assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
     assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
 
+def is_cjk(s):
+    """True if `s` contains any CJK, Hiragana, Katakana or Hangul character."""
+    scripts = ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")  # name(c, "") never raises
+    return any(unicodedata.name(c, "").startswith(scripts) for c in (s or ""))
+
+def test_is_cjk():
+    """Empty/Latin input is rejected; Han, kana and Hangul text is accepted."""
+    for rejected in (None, '', 'blah'):
+        assert is_cjk(rejected) == False
+    # expected True, including the mixed Latin/kana string
+    accepted = (
+        '岡, 鹿, 梨, 阜, 埼', '菊', 'ひヒ', 'english with ひヒ',
+        'き゚ゅ', '水道', 'ㄴ, ㄹ, ㅁ, ㅂ, ㅅ',
+    )
+    for text in accepted:
+        assert is_cjk(text) == True
+
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
     # LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher):
         return counts
 
 
+class Bs4XmlFilePusher(RecordPusher):
+    """Parses an XML file with BeautifulSoup and pushes each `record_tag` element."""
+
+    def __init__(self, importer, xml_file, record_tag, **kwargs):
+        self.importer, self.xml_file, self.record_tag = importer, xml_file, record_tag
+
+    def run(self):
+        """Push every matching element to the importer; print and return its counts."""
+        document = BeautifulSoup(self.xml_file, "xml")
+        for element in document.find_all(self.record_tag):
+            self.importer.push_record(element)
+        result = self.importer.finish()
+        print(result)
+        return result
+
+
 class KafkaJsonPusher(RecordPusher):
 
     def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
new file mode 100644
index 00000000..d7b89727
--- /dev/null
+++ b/python/fatcat_tools/importers/jalc.py
@@ -0,0 +1,310 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import itertools
+import subprocess
+from bs4 import BeautifulSoup
+
+import fatcat_client
+from .common import EntityImporter, clean, is_cjk, DATE_FMT
+
+
+class JalcImporter(EntityImporter):
+    """
+    Importer for JALC DOI metadata.
+
+    NOTE: some JALC DOIs seem to get cross-registered with Crossref
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+        """Extra kwargs read here: editgroup_*, create_containers, extid_map_file."""
+        # pop, not get: both are re-passed by name below (else "multiple values" TypeError)
+        eg_desc = kwargs.pop('editgroup_description', "Automated import of JALC DOI metadata")
+        # copy so the caller's dict is not mutated when 'agent' is filled in
+        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
+        eg_extra.setdefault('agent', 'fatcat_tools.JalcImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.create_containers = kwargs.get('create_containers')
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri))
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map")
+        self.read_issn_map_file(issn_map_file)
+
+    def lookup_ext_ids(self, doi):
+        """Return a dict of external identifiers known for `doi`.
+
+        Every value is None when there is no map, no DOI, or no matching row."""
+        ids = dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        # records without a DOI reach here as doi=None; doi.lower() would raise
+        if self.extid_map_db is None or not doi:
+            return ids
+        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            [doi.lower()]).fetchone()
+        if row is None:
+            return ids
+        # normalize NULL/empty cells to None and everything else to str
+        core_id, pmid, pmcid, wikidata_qid = [str(cell or '') or None for cell in row]
+        # TODO: arxiv_id and jstor_id are not in the map yet, so they stay None
+        ids.update(core_id=core_id, pmid=pmid, pmcid=pmcid, wikidata_qid=wikidata_qid)
+        return ids
+
+    def want(self, obj):
+        return True  # accept every record; nothing is filtered out at this stage
+
+    def parse_record(self, record):
+        """
+        record is a beautiful soup object (one <Description> element)
+        returns a ReleaseEntity, or None if the record has no usable title
+
+        In JALC metadata, both English and Japanese records are given for most
+        fields.
+        """
+
+        extra = dict()
+        extra_jalc = dict()
+
+        titles = record.find_all("title")
+        if not titles or not titles[0].string:
+            return None
+        title = titles[0].string.strip()
+        original_title = None
+        if title.endswith('.'):
+            title = title[:-1]
+        if len(titles) > 1 and titles[1].string:
+            original_title = titles[1].string.strip()
+            if original_title.endswith('.'):
+                original_title = original_title[:-1]
+
+        doi = None
+        if record.doi:
+            doi = record.doi.string.lower().strip()
+            assert doi.startswith('10.')
+
+        contribs = []
+        people = record.find_all("Person")
+        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
+            # names are interleaved: (english, japanese) for every author
+            for eng, jpn in zip(people[0::2], people[1::2]):
+                # there isn't always an english name though? TODO
+                name = eng
+                if not name.find('name'):
+                    name = jpn
+                contrib = fatcat_client.ReleaseContrib(
+                    raw_name=clean(name.find('name').string),
+                    given_name=clean(name.find('givenName').string),
+                    surname=clean(name.find('familyName').string),
+                    role='author',
+                )
+                if eng.find('name') and jpn.find('name'):
+                    contrib.extra = {
+                        'original_name': {
+                            'lang': 'ja',
+                            'raw_name': clean(jpn.find('name').string),
+                            'given_name': clean(jpn.find('givenName').string),
+                            'surname': clean(jpn.find('familyName').string),
+                        }}
+                contribs.append(contrib)
+        elif people:
+            # TODO: test for this codepath?
+            for eng in people:
+                contrib = fatcat_client.ReleaseContrib(
+                    raw_name=clean(eng.find('name').string),
+                    given_name=clean(eng.find('givenName').string),
+                    surname=clean(eng.find('familyName').string),
+                    role='author',
+                )
+                contribs.append(contrib)
+
+        release_year = None
+        release_date = None
+        date = record.date or None
+        if date:
+            date = (date.string or '').strip()
+            if len(date) == 10:
+                release_date = datetime.datetime.strptime(date, DATE_FMT).date()
+                release_year = release_date.year
+                release_date = release_date.isoformat()
+            elif len(date) == 4:
+                release_year = int(date)
+
+        pages = None
+        if record.startingPage:
+            pages = record.startingPage.string
+            if record.endingPage:
+                pages = "{}-{}".format(pages, record.endingPage.string)
+        volume = None
+        if record.volume:
+            volume = record.volume.string
+        issue = None
+        if record.number:
+            # note: number/issue transform
+            issue = record.number.string
+
+        # container
+        issn = None
+        issn_list = record.find_all("issn")
+        if issn_list:
+            # if we wanted the other ISSNs, would also need to uniq the list.
+            # But we only need one to lookup ISSN-L/container
+            issn = issn_list[0].string
+        issnl = self.issn2issnl(issn)
+        container_id = None
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+
+        publisher = None
+        container_name = None
+        container_extra = dict()
+
+        if record.publicationName:
+            pubs = [(p.string or '').strip() for p in record.find_all("publicationName")]
+            pubs = [p for p in pubs if p]
+            assert(pubs)
+            if len(pubs) > 1 and pubs[0] == pubs[1]:
+                pubs = [pubs[0]]
+            elif len(pubs) > 1 and is_cjk(pubs[0]):
+                # ordering is not reliable
+                pubs = [pubs[1], pubs[0]]
+            container_name = clean(pubs[0])
+            if len(pubs) > 1:
+                container_extra['original_name'] = clean(pubs[1])
+
+        if record.publisher:
+            pubs = [(p.string or '').strip() for p in record.find_all("publisher")]
+            pubs = [p for p in pubs if p]
+            if len(pubs) > 1 and pubs[0] == pubs[1]:
+                pubs = [pubs[0]]
+            elif len(pubs) > 1 and is_cjk(pubs[0]):
+                # ordering is not reliable
+                pubs = [pubs[1], pubs[0]]
+            # every <publisher> may be blank, so pubs can be empty here
+            publisher = clean(pubs[0]) if pubs else None
+            if len(pubs) > 1:
+                container_extra['publisher_alt_name'] = clean(pubs[1])
+
+        if (container_id is None and self.create_containers and (issnl is not None)
+                and container_name):
+            # name, type, publisher, issnl
+            # extra: issnp, issne, original_name, languages, country
+            container_extra['country'] = 'jp'
+            container_extra['languages'] = ['ja']
+            ce = fatcat_client.ContainerEntity(
+                name=container_name,
+                container_type='journal',
+                publisher=publisher,
+                issnl=issnl,
+                extra=(container_extra or None))
+            ce_edit = self.create_container(ce)
+            container_id = ce_edit.ident
+
+        # the vast majority of works are in japanese
+        # TODO: any indication when *not* in japanese?
+        lang = "ja"
+
+        # reasonable default for this collection
+        release_type = "article-journal"
+
+        # external identifiers
+        extids = self.lookup_ext_ids(doi=doi)
+
+        # extra:
+        #   translation_of
+        #   aliases
+        #   container_name
+        #   group-title
+        # always put at least an empty dict here to indicate the DOI registrar
+        # (informally)
+        extra['jalc'] = extra_jalc
+
+        re = fatcat_client.ReleaseEntity(
+            work_id=None,
+            title=title,
+            original_title=original_title,
+            release_type=release_type,
+            release_stage='published',
+            release_date=release_date,
+            release_year=release_year,
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                pmid=extids['pmid'],
+                pmcid=extids['pmcid'],
+                wikidata_qid=extids['wikidata_qid'],
+                core=extids['core_id'],
+                arxiv=extids['arxiv_id'],
+                jstor=extids['jstor_id'],
+            ),
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=lang,
+            #license_slug
+
+            # content, mimetype, lang
+            #abstracts=abstracts,
+
+            # raw_name, role, raw_affiliation, extra
+            contribs=contribs,
+
+
+            extra=extra,
+        )
+        return re
+
+    def try_update(self, re):
+        """
+        Decide whether `re` should be inserted: True when no release with its
+        DOI exists yet (lookup returns 404), False otherwise.
+        """
+        try:
+            found = self.api.lookup_release(doi=re.ext_ids.doi)
+        except fatcat_client.rest.ApiException as err:
+            if err.status == 404:
+                # doesn't exist, need to insert
+                return True
+            raise err
+
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if found:
+            self.counts['exists'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        # wrap the batch and a new editgroup into a single auto-batch API call
+        editgroup = fatcat_client.Editgroup(
+            description=self.editgroup_description, extra=self.editgroup_extra)
+        auto_batch = fatcat_client.ReleaseAutoBatch(editgroup=editgroup, entity_list=batch)
+        self.api.create_release_auto_batch(auto_batch)
+
+    def parse_file(self, handle):
+        """
+        Debugging helper: parse every record in an XML file and print the
+        resulting entities, without going through a pusher.
+
+        handle: passed straight to BeautifulSoup
+        """
+        # 1. open with beautiful soup
+        document = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over <Description> records, call parse_record on each
+        for description in document.find_all("Description"):
+            entity = self.parse_record(description)
+            print(entity)
+
+
+if __name__=='__main__':
+    # JalcXmlParser is not defined in this module; JalcImporter needs an API client and ISSN map
+    sys.exit("jalc.py cannot run stand-alone: run JalcImporter through Bs4XmlFilePusher instead")
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
deleted file mode 100644
index d7817df9..00000000
--- a/python/parse_jalc_xml.py
+++ /dev/null
@@ -1,209 +0,0 @@
-
-import sys
-import json
-import datetime
-import unicodedata
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-DATE_FMT = "%Y-%m-%d"
-
-def is_cjk(s):
-    if not s:
-        return False
-    return unicodedata.name(s[0]).startswith("CJK")
-
-class JalcXmlParser():
-    """
-    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity
-
-    NOTE: some JALC DOIs seem to get cross-registered with Crossref
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for record in soup.find_all("Description"):
-            resp = self.parse_record(record)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-
-    def parse_record(self, record):
-        """
-        In JALC metadata, both English and Japanese records are given for most
-        fields.
-        """
-
-        #extra = dict()
-        #extra_jalc = dict()
-
-        titles = record.find_all("title")
-        title = titles[0].string.strip()
-        original_title = None
-        if title.endswith('.'):
-            title = title[:-1]
-        if len(titles) > 1:
-            original_title = titles[1].string.strip()
-            if original_title.endswith('.'):
-                original_title = original_title[:-1]
-
-        doi = None
-        if record.doi:
-            doi = record.doi.string.lower().strip()
-            assert doi.startswith('10.')
-
-        contribs = []
-        people = record.find_all("Person")
-        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
-            # both english and japanese names are included
-            for i in range(int(len(people)/2)):
-                # both english and japanese names are included for every author
-                eng = people[i*2]
-                jpn = people[i*2 + 1]
-                raw_name = eng.find('name')
-                orig_name = jpn.find('name')
-                if not raw_name:
-                    raw_name = orig_name
-                contrib = dict(
-                    raw_name=raw_name.string,
-                    role='author',
-                )
-                if raw_name and orig_name:
-                    contrib['extra'] = dict(original_name=orig_name.string)
-                contribs.append(contrib)
-        elif people:
-            for eng in people:
-                raw_name = eng.find('name')
-                contrib = dict(
-                    raw_name=eng.find('name').string,
-                    role='author',
-                )
-                contribs.append(contrib)
-
-        release_year = None
-        release_date = None
-        date = record.date or None
-        if date:
-            date = date.string
-            if len(date) is 10:
-                release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date()
-                release_year = release_date.year
-                release_date = release_date.isoformat()
-            elif len(date) is 4:
-                release_year = int(date)
-
-        pages = None
-        if record.startingPage:
-            pages = record.startingPage.string
-            if record.endingPage:
-                pages = "{}-{}".format(pages, record.endingPage.string)
-        volume = None
-        if record.volume:
-            volume = record.volume.string
-        issue = None
-        if record.number:
-            # note: number/issue transform
-            issue = record.number.string
-
-        issn = None
-        issn_list = record.find_all("issn")
-        if issn_list:
-            # if we wanted the other ISSNs, would also need to uniq the list.
-            # But we only need one to lookup ISSN-L/container
-            issn = issn_list[0].string
-
-        container = dict()
-        container_extra = dict()
-        container_name = None
-        if record.publicationName:
-            pubs = [p.string.strip() for p in record.find_all("publicationName")]
-            pubs = [p for p in pubs if p]
-            assert(pubs)
-            if len(pubs) > 1 and pubs[0] == pubs[1]:
-                pubs = [pubs[0]]
-            elif len(pubs) > 1 and is_cjk(pubs[0]):
-                # ordering is not reliable
-                pubs = [pubs[1], pubs[0]]
-            container_name = pubs[0]
-            container['name'] = container_name
-            if len(pubs) > 1:
-                orig_container_name = pubs[1]
-                container_extra['original_name'] = pubs[1]
-        publisher = None
-        if record.publisher:
-            pubs = [p.string.strip() for p in record.find_all("publisher")]
-            pubs = [p for p in pubs if p]
-            if len(pubs) > 1 and pubs[0] == pubs[1]:
-                pubs = [pubs[0]]
-            elif len(pubs) > 1 and is_cjk(pubs[0]):
-                # ordering is not reliable
-                pubs = [pubs[1], pubs[0]]
-            publisher = pubs[0]
-            container['publisher'] = publisher
-            if len(pubs) > 1:
-                container_extra['publisher_alt_name'] = pubs[1]
-        if container_extra:
-            container['extra'] = container_extra
-        if not container:
-            container = None
-
-        # the vast majority of works are in japanese
-        # TODO: any indication when *not* in japanese?
-        lang = "ja"
-
-        # reasonable default for this collection
-        release_type = "article-journal"
-
-        re = dict(
-            work_id=None,
-            title=title,
-            original_title=original_title,
-            release_type="article-journal",
-            release_status='submitted', # XXX: source_type?
-            release_date=release_date,
-            release_year=release_year,
-            #arxiv_id
-            doi=doi,
-            #pmid
-            #pmcid
-            #isbn13     # never in Article
-            volume=volume,
-            issue=issue,
-            pages=pages,
-            publisher=publisher,
-            language=lang,
-            #license_slug   # not in MEDLINE
-
-            # content, mimetype, lang
-            #abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
-            contribs=contribs,
-
-            #   name, type, publisher, issnl
-            #   extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
-            #extra=extra,
-        )
-        return re
-
-if __name__=='__main__':
-    parser = JalcXmlParser()
-    parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py
new file mode 100644
index 00000000..7b25f0fa
--- /dev/null
+++ b/python/tests/import_jalc.py
@@ -0,0 +1,88 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def jalc_importer(api):  # importer built with bezerk_mode=True
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_map:
+        yield JalcImporter(api, issn_map, bezerk_mode=True, extid_map_file='tests/files/example_map.sqlite3')
+
+@pytest.fixture(scope="function")
+def jalc_importer_existing(api):  # importer built with bezerk_mode=False
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_map:
+        yield JalcImporter(api, issn_map, bezerk_mode=False, extid_map_file='tests/files/example_map.sqlite3')
+
+def test_jalc_importer(jalc_importer):
+    """Import the sample twice: first run inserts, second run only finds existing."""
+    sample_path = 'tests/files/jalc_lod_sample.xml'
+
+    # first pass, bezerk mode: both sample records are inserted
+    index_before = jalc_importer.api.get_changelog(limit=1)[0].index
+    with open(sample_path, 'r') as xml_file:
+        jalc_importer.bezerk_mode = True
+        first = Bs4XmlFilePusher(jalc_importer, xml_file, "Description").run()
+    assert (first['insert'], first['exists'], first['skip']) == (2, 0, 0)
+
+    # the changelog entry after index_before carries the import's editgroup
+    editgroup = jalc_importer.api.get_changelog_entry(index=index_before+1).editgroup
+    assert editgroup.description
+    assert "jalc" in editgroup.description.lower()
+    assert editgroup.extra['git_rev']
+    assert "fatcat_tools.JalcImporter" in editgroup.extra['agent']
+
+    # second pass, normal mode: nothing new, and the changelog does not advance
+    index_before = jalc_importer.api.get_changelog(limit=1)[0].index
+    with open(sample_path, 'r') as xml_file:
+        jalc_importer.bezerk_mode = False
+        jalc_importer.reset()
+        second = Bs4XmlFilePusher(jalc_importer, xml_file, "Description").run()
+    assert (second['insert'], second['exists'], second['skip']) == (0, 2, 0)
+    assert index_before == jalc_importer.api.get_changelog(limit=1)[0].index
+
+def test_jalc_dict_parse(jalc_importer):
+    """Parse the first sample record and pin every mapped release field."""
+    with open('tests/files/jalc_lod_sample.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r = jalc_importer.parse_record(soup.find_all("Description")[0])
+    print(r.extra)
+    assert r.title == "New carbides in the Ni-Ti-Mo-C system"
+    assert r.subtitle is None
+    assert r.original_title == "Ｎｉ－Ｔｉ－Ｍｏ－Ｃ系に出現する新炭化物相について"
+    assert r.publisher == "Japan Society of Powder and Powder Metallurgy"
+    assert r.release_type == "article-journal"
+    assert r.release_stage == "published"
+    assert r.license_slug is None
+    assert r.ext_ids.doi == "10.2497/jjspm.36.898"
+    assert r.language == "ja"
+    assert r.volume == "36"
+    assert r.issue == "8"
+    assert r.pages == "898-902"
+    assert r.release_year == 1989
+    # XXX:
+    #assert 'subtitle' not in r.extra
+    #assert 'subtitle' not in r.extra['jalc']
+    #assert 'funder' not in r.extra
+    #assert 'funder' not in r.extra['jalc']
+    # matched by ISSN, so shouldn't be in there?
+    #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+    assert len(r.contribs) == 4
+
+    assert r.contribs[0].raw_name == "Hashimoto Yasuhiko"
+    assert r.contribs[0].given_name == "Yasuhiko"
+    assert r.contribs[0].surname == "Hashimoto"
+    assert r.contribs[0].extra['original_name']['raw_name'] == "橋本 雍彦"
+    assert r.contribs[0].extra['original_name']['given_name'] == "雍彦"
+    assert r.contribs[0].extra['original_name']['surname'] == "橋本"
+
+    assert r.contribs[3].raw_name == "Takahashi Teruo"
+    assert r.contribs[3].given_name == "Teruo"
+    assert r.contribs[3].surname == "Takahashi"
+    assert r.contribs[3].extra['original_name']['raw_name'] == "高橋 輝男"
+    assert r.contribs[3].extra['original_name']['given_name'] == "輝男"
+    assert r.contribs[3].extra['original_name']['surname'] == "高橋"
+
+    assert not r.refs