| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 12:02:55 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 | 
| commit | 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch) | |
| tree | 930331468462a74873aebb44b88c051e8b096c4f | |
| parent | 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff) | |
initial flesh out of JALC parser
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 3 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 36 |
| -rw-r--r-- | python/fatcat_tools/importers/jalc.py | 310 |
| -rw-r--r-- | python/parse_jalc_xml.py | 209 |
| -rw-r--r-- | python/tests/import_jalc.py | 88 |

5 files changed, 436 insertions, 210 deletions
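For orientation, the wiring between the pieces this commit adds can be sketched as below. This is not part of the diff; it just follows the pattern used in the new python/tests/import_jalc.py. `api` is assumed to be an instantiated fatcat API client, and the file paths are placeholders.

```python
# Minimal sketch (assumptions noted above): wire the new JalcImporter to the
# new Bs4XmlFilePusher, which feeds each <Description> XML record to the
# importer and returns the final insert/exists/skip counts.
from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher

def run_jalc_import(api, issn_map_path, jalc_xml_path):
    # the ISSN map is read during importer construction
    with open(issn_map_path, 'r') as issn_file:
        importer = JalcImporter(api, issn_file)
    with open(jalc_xml_path, 'r') as xml_file:
        # push every <Description> record from the JALC XML dump
        return Bs4XmlFilePusher(importer, xml_file, "Description").run()
```

The same importer-plus-pusher pattern is exercised twice in test_jalc_importer in the new test file, once in bezerk mode and once against existing entities.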
| diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index f5ff43e5..ecbfe38e 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -12,8 +12,9 @@ To run an import you combine two classes; one each of:  """ -from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, KafkaJsonPusher, make_kafka_consumer, clean +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk  from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP +from .jalc import JalcImporter  from .grobid_metadata import GrobidMetadataImporter  from .journal_metadata import JournalMetadataImporter  from .matched import MatchedImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 282f775c..7fca38cf 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,13 +7,16 @@ import ftfy  import sqlite3  import itertools  import subprocess +import unicodedata  from collections import Counter  import pykafka +from bs4 import BeautifulSoup  import fatcat_client  from fatcat_client.rest import ApiException +DATE_FMT = "%Y-%m-%d"  SANE_MAX_RELEASES = 200  SANE_MAX_URLS = 100 @@ -52,6 +55,23 @@ def test_clean():      assert clean('<b>a&b</b>') == '<b>a&b</b>'      assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' +def is_cjk(s): +    if not s: +        return False +    return unicodedata.name(s[0]).startswith("CJK") + +def test_is_cjk(): +    assert is_cjk(None) == False +    assert is_cjk('') == False +    assert is_cjk('blah') == False +    assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True +    assert is_cjk('菊') == True +    assert is_cjk('ひヒ') == True +    assert is_cjk('english with ひヒ') == True +    assert is_cjk('き゚ゅ') == True +    assert is_cjk('水道') == True +    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True +  DOMAIN_REL_MAP = {      "archive.org": "archive",      # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher):          return counts +class Bs4XmlFilePusher(RecordPusher): + +    def __init__(self, importer, xml_file, record_tag, **kwargs): +        self.importer = importer +        self.xml_file = xml_file +        self.record_tag = record_tag + +    def run(self): +        soup = BeautifulSoup(self.xml_file, "xml") +        for record in soup.find_all(self.record_tag): +            self.importer.push_record(record) +        counts = self.importer.finish() +        print(counts) +        return counts + +  class KafkaJsonPusher(RecordPusher):      def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py new file mode 100644 index 00000000..d7b89727 --- /dev/null +++ b/python/fatcat_tools/importers/jalc.py @@ -0,0 +1,310 @@ + +import sys +import json +import sqlite3 +import datetime +import itertools +import subprocess +from bs4 import BeautifulSoup + +import fatcat_client +from .common import EntityImporter, clean, is_cjk, DATE_FMT + + +class JalcImporter(EntityImporter): +    """ +    Importer for JALC DOI metadata. 
+ +    NOTE: some JALC DOIs seem to get cross-registered with Crossref +    """ + +    def __init__(self, api, issn_map_file, **kwargs): + +        eg_desc = kwargs.get('editgroup_description', +            "Automated import of JALC DOI metadata") +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') +        super().__init__(api, +            issn_map_file=issn_map_file, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs) + +        self.create_containers = kwargs.get('create_containers') +        extid_map_file = kwargs.get('extid_map_file') +        self.extid_map_db = None +        if extid_map_file: +            db_uri = "file:{}?mode=ro".format(extid_map_file) +            print("Using external ID map: {}".format(db_uri)) +            self.extid_map_db = sqlite3.connect(db_uri, uri=True) +        else: +            print("Not using external ID map") + +        self.read_issn_map_file(issn_map_file) + +    def lookup_ext_ids(self, doi): +        if self.extid_map_db is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) +        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", +            [doi.lower()]).fetchone() +        if row is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) +        row = [str(cell or '') or None for cell in row] +        return dict( +            core_id=row[0], +            pmid=row[1], +            pmcid=row[2], +            wikidata_qid=row[3], +            # TODO: +            arxiv_id=None, +            jstor_id=None, +        ) + +    def want(self, obj): +        return True + +    def parse_record(self, record): +        """ +        record is a beautiful soup object +        returns a ReleaseEntity, or None + +        In JALC metadata, both English and Japanese records are given for most +        fields. +        """ + +        extra = dict() +        extra_jalc = dict() + +        titles = record.find_all("title") +        title = titles[0].string.strip() +        original_title = None +        if title.endswith('.'): +            title = title[:-1] +        if len(titles) > 1: +            original_title = titles[1].string.strip() +            if original_title.endswith('.'): +                original_title = original_title[:-1] + +        doi = None +        if record.doi: +            doi = record.doi.string.lower().strip() +            assert doi.startswith('10.') + +        contribs = [] +        people = record.find_all("Person") +        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): +            # both english and japanese names are included for every author +            for i in range(int(len(people)/2)): +                eng = people[i*2] +                jpn = people[i*2 + 1] +                # there isn't always an english name though? 
TODO +                name = eng +                if not name.find('name'): +                    name = jpn +                contrib = fatcat_client.ReleaseContrib( +                    raw_name=clean(name.find('name').string), +                    given_name=clean(name.find('givenName').string), +                    surname=clean(name.find('familyName').string), +                    role='author', +                ) +                if eng.find('name') and jpn.find('name'): +                    contrib.extra = { +                        'original_name': { +                            'lang': 'ja', +                            'raw_name': clean(jpn.find('name').string), +                            'given_name': clean(jpn.find('givenName').string), +                            'surname': clean(jpn.find('familyName').string), +                        }} +                contribs.append(contrib) +        elif people: +            # TODO: test for this codepath? +            for eng in people: +                contrib = dict( +                    raw_name=clean(eng.find('name').string), +                    given_name=clean(eng.find('givenName').string), +                    surname=clean(eng.find('familyName').string), +                    role='author', +                ) +                contribs.append(contrib) + +        release_year = None +        release_date = None +        date = record.date or None +        if date: +            date = date.string +            if len(date) is 10: +                release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() +                release_year = release_date.year +                release_date = release_date.isoformat() +            elif len(date) is 4: +                release_year = int(date) + +        pages = None +        if record.startingPage: +            pages = record.startingPage.string +            if record.endingPage: +                pages = "{}-{}".format(pages, record.endingPage.string) +        volume = None +        if record.volume: +            volume = record.volume.string +        issue = None +        if record.number: +            # note: number/issue transform +            issue = record.number.string + +        # container +        issn = None +        issn_list = record.find_all("issn") +        if issn_list: +            # if we wanted the other ISSNs, would also need to uniq the list. 
+            # But we only need one to lookup ISSN-L/container +            issn = issn_list[0].string +        issnl = self.issn2issnl(issn) +        container_id = None +        if issnl: +            container_id = self.lookup_issnl(issnl) + +        publisher = None +        container_name = None +        container_extra = dict() + +        if record.publicationName: +            pubs = [p.string.strip() for p in record.find_all("publicationName")] +            pubs = [p for p in pubs if p] +            assert(pubs) +            if len(pubs) > 1 and pubs[0] == pubs[1]: +                pubs = [pubs[0]] +            elif len(pubs) > 1 and is_cjk(pubs[0]): +                # ordering is not reliable +                pubs = [pubs[1], pubs[0]] +            container_name = clean(pubs[0]) +            if len(pubs) > 1: +                orig_container_name = pubs[1] +                container_extra['original_name'] = clean(pubs[1]) + +        if record.publisher: +            pubs = [p.string.strip() for p in record.find_all("publisher")] +            pubs = [p for p in pubs if p] +            if len(pubs) > 1 and pubs[0] == pubs[1]: +                pubs = [pubs[0]] +            elif len(pubs) > 1 and is_cjk(pubs[0]): +                # ordering is not reliable +                pubs = [pubs[1], pubs[0]] +            publisher = clean(pubs[0]) +            if len(pubs) > 1: +                container_extra['publisher_alt_name'] = pubs[1] + +        if (container_id is None and self.create_containers and (issnl is not None) +                and container_name): +            # name, type, publisher, issnl +            # extra: issnp, issne, original_name, languages, country +            container_extra['country'] = 'jp' +            container_extra['languages'] = ['ja'] +            ce = fatcat_client.ContainerEntity( +                name=container_name, +                container_type='journal', +                publisher=publisher, +                issnl=issnl, +                extra=(container_extra or None)) +            ce_edit = self.create_container(ce) +            container_id = ce_edit.ident + +        # the vast majority of works are in japanese +        # TODO: any indication when *not* in japanese? 
+        lang = "ja" + +        # reasonable default for this collection +        release_type = "article-journal" + +        # external identifiers +        extids = self.lookup_ext_ids(doi=doi) + +        # extra: +        #   translation_of +        #   aliases +        #   container_name +        #   group-title +        # always put at least an empty dict here to indicate the DOI registrar +        # (informally) +        extra['jalc'] = extra_jalc + +        re = fatcat_client.ReleaseEntity( +            work_id=None, +            title=title, +            original_title=original_title, +            release_type="article-journal", +            release_stage='published', +            release_date=release_date, +            release_year=release_year, +            ext_ids=fatcat_client.ReleaseExtIds( +                doi=doi, +                pmid=extids['pmid'], +                pmcid=extids['pmcid'], +                wikidata_qid=extids['wikidata_qid'], +                core=extids['core_id'], +                arxiv=extids['arxiv_id'], +                jstor=extids['jstor_id'], +            ), +            volume=volume, +            issue=issue, +            pages=pages, +            publisher=publisher, +            language=lang, +            #license_slug + +            # content, mimetype, lang +            #abstracts=abstracts, + +            # raw_name, role, raw_affiliation, extra +            contribs=contribs, + + +            extra=extra, +        ) +        return re + +    def try_update(self, re): + +        # lookup existing DOI +        existing = None +        try: +            existing = self.api.lookup_release(doi=re.ext_ids.doi) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err +            # doesn't exist, need to insert +            return True + +        # eventually we'll want to support "updates", but for now just skip if +        # entity already exists +        if existing: +            self.counts['exists'] += 1 +            return False +        return False + +    def insert_batch(self, batch): +        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch( +            editgroup=fatcat_client.Editgroup( +                description=self.editgroup_description, +                extra=self.editgroup_extra), +            entity_list=batch)) + +    def parse_file(self, handle): +        """ +        Helper for testing; can run this file stand-alone instead of using a pusher +        """ + +        # 1. open with beautiful soup +        soup = BeautifulSoup(handle, "xml") + +        # 2. 
iterate over articles, call parse_article on each +        for record in soup.find_all("Description"): +            resp = self.parse_record(record) +            #print(json.dumps(resp)) +            print(resp) +            #sys.exit(-1) + + +if __name__=='__main__': +    parser = JalcXmlParser() +    parser.parse_file(open(sys.argv[1])) diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py deleted file mode 100644 index d7817df9..00000000 --- a/python/parse_jalc_xml.py +++ /dev/null @@ -1,209 +0,0 @@ - -import sys -import json -import datetime -import unicodedata -from bs4 import BeautifulSoup -from bs4.element import NavigableString - - -DATE_FMT = "%Y-%m-%d" - -def is_cjk(s): -    if not s: -        return False -    return unicodedata.name(s[0]).startswith("CJK") - -class JalcXmlParser(): -    """ -    Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity - -    NOTE: some JALC DOIs seem to get cross-registered with Crossref -    """ - -    def __init__(self): -        pass - -    def parse_file(self, handle): - -        # 1. open with beautiful soup -        soup = BeautifulSoup(handle, "xml") - -        # 2. iterate over articles, call parse_article on each -        for record in soup.find_all("Description"): -            resp = self.parse_record(record) -            print(json.dumps(resp)) -            #sys.exit(-1) - - -    def parse_record(self, record): -        """ -        In JALC metadata, both English and Japanese records are given for most -        fields. -        """ - -        #extra = dict() -        #extra_jalc = dict() - -        titles = record.find_all("title") -        title = titles[0].string.strip() -        original_title = None -        if title.endswith('.'): -            title = title[:-1] -        if len(titles) > 1: -            original_title = titles[1].string.strip() -            if original_title.endswith('.'): -                original_title = original_title[:-1] - -        doi = None -        if record.doi: -            doi = record.doi.string.lower().strip() -            assert doi.startswith('10.') - -        contribs = [] -        people = record.find_all("Person") -        if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): -            # both english and japanese names are included -            for i in range(int(len(people)/2)): -                # both english and japanese names are included for every author -                eng = people[i*2] -                jpn = people[i*2 + 1] -                raw_name = eng.find('name') -                orig_name = jpn.find('name') -                if not raw_name: -                    raw_name = orig_name -                contrib = dict( -                    raw_name=raw_name.string, -                    role='author', -                ) -                if raw_name and orig_name: -                    contrib['extra'] = dict(original_name=orig_name.string) -                contribs.append(contrib) -        elif people: -            for eng in people: -                raw_name = eng.find('name') -                contrib = dict( -                    raw_name=eng.find('name').string, -                    role='author', -                ) -                contribs.append(contrib) - -        release_year = None -        release_date = None -        date = record.date or None -        if date: -            date = date.string -            if len(date) is 10: -                release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() 
-                release_year = release_date.year -                release_date = release_date.isoformat() -            elif len(date) is 4: -                release_year = int(date) - -        pages = None -        if record.startingPage: -            pages = record.startingPage.string -            if record.endingPage: -                pages = "{}-{}".format(pages, record.endingPage.string) -        volume = None -        if record.volume: -            volume = record.volume.string -        issue = None -        if record.number: -            # note: number/issue transform -            issue = record.number.string - -        issn = None -        issn_list = record.find_all("issn") -        if issn_list: -            # if we wanted the other ISSNs, would also need to uniq the list. -            # But we only need one to lookup ISSN-L/container -            issn = issn_list[0].string - -        container = dict() -        container_extra = dict() -        container_name = None -        if record.publicationName: -            pubs = [p.string.strip() for p in record.find_all("publicationName")] -            pubs = [p for p in pubs if p] -            assert(pubs) -            if len(pubs) > 1 and pubs[0] == pubs[1]: -                pubs = [pubs[0]] -            elif len(pubs) > 1 and is_cjk(pubs[0]): -                # ordering is not reliable -                pubs = [pubs[1], pubs[0]] -            container_name = pubs[0] -            container['name'] = container_name -            if len(pubs) > 1: -                orig_container_name = pubs[1] -                container_extra['original_name'] = pubs[1] -        publisher = None -        if record.publisher: -            pubs = [p.string.strip() for p in record.find_all("publisher")] -            pubs = [p for p in pubs if p] -            if len(pubs) > 1 and pubs[0] == pubs[1]: -                pubs = [pubs[0]] -            elif len(pubs) > 1 and is_cjk(pubs[0]): -                # ordering is not reliable -                pubs = [pubs[1], pubs[0]] -            publisher = pubs[0] -            container['publisher'] = publisher -            if len(pubs) > 1: -                container_extra['publisher_alt_name'] = pubs[1] -        if container_extra: -            container['extra'] = container_extra -        if not container: -            container = None - -        # the vast majority of works are in japanese -        # TODO: any indication when *not* in japanese? -        lang = "ja" - -        # reasonable default for this collection -        release_type = "article-journal" - -        re = dict( -            work_id=None, -            title=title, -            original_title=original_title, -            release_type="article-journal", -            release_status='submitted', # XXX: source_type? 
-            release_date=release_date, -            release_year=release_year, -            #arxiv_id -            doi=doi, -            #pmid -            #pmcid -            #isbn13     # never in Article -            volume=volume, -            issue=issue, -            pages=pages, -            publisher=publisher, -            language=lang, -            #license_slug   # not in MEDLINE - -            # content, mimetype, lang -            #abstracts=abstracts, - -            # raw_name, role, raw_affiliation, extra -            contribs=contribs, - -            #   name, type, publisher, issnl -            #   extra: issnp, issne, original_name, languages, country -            container=container, - -            # extra: -            #   withdrawn_date -            #   translation_of -            #   subtitle -            #   aliases -            #   container_name -            #   group-title -            #   pubmed: retraction refs -            #extra=extra, -        ) -        return re - -if __name__=='__main__': -    parser = JalcXmlParser() -    parser.parse_file(open(sys.argv[1])) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py new file mode 100644 index 00000000..7b25f0fa --- /dev/null +++ b/python/tests/import_jalc.py @@ -0,0 +1,88 @@ + +import json, gzip +import pytest +from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher +from fixtures import api +from bs4 import BeautifulSoup + + +@pytest.fixture(scope="function") +def jalc_importer(api): +    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: +        yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) + +@pytest.fixture(scope="function") +def jalc_importer_existing(api): +    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: +        yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) + +def test_jalc_importer(jalc_importer): +    last_index = jalc_importer.api.get_changelog(limit=1)[0].index +    with open('tests/files/jalc_lod_sample.xml', 'r') as f: +        jalc_importer.bezerk_mode = True +        counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run() +    assert counts['insert'] == 2 +    assert counts['exists'] == 0 +    assert counts['skip'] == 0 + +    # fetch most recent editgroup +    change = jalc_importer.api.get_changelog_entry(index=last_index+1) +    eg = change.editgroup +    assert eg.description +    assert "jalc" in eg.description.lower() +    assert eg.extra['git_rev'] +    assert "fatcat_tools.JalcImporter" in eg.extra['agent'] + +    last_index = jalc_importer.api.get_changelog(limit=1)[0].index +    with open('tests/files/jalc_lod_sample.xml', 'r') as f: +        jalc_importer.bezerk_mode = False +        jalc_importer.reset() +        counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run() +    assert counts['insert'] == 0 +    assert counts['exists'] == 2 +    assert counts['skip'] == 0 +    assert last_index == jalc_importer.api.get_changelog(limit=1)[0].index + +def test_jalc_dict_parse(jalc_importer): +    with open('tests/files/jalc_lod_sample.xml', 'r') as f: +        soup = BeautifulSoup(f, "xml") +        r = jalc_importer.parse_record(soup.find_all("Description")[0]) + +    print(r.extra) +    assert r.title == "New carbides in the Ni-Ti-Mo-C system" +    assert r.subtitle == None +    assert r.original_title == "Ni-Ti-Mo-C系に出現する新炭化物相について" +    assert r.publisher == "Japan Society of Powder and 
Powder Metallurgy" +    assert r.release_type == "article-journal" +    assert r.release_stage == "published" +    assert r.license_slug == None +    assert r.ext_ids.doi == "10.2497/jjspm.36.898" +    assert r.language == "ja" +    assert r.volume == "36" +    assert r.issue == "8" +    assert r.pages == "898-902" +    assert r.release_year == 1989 +    # XXX: +    #assert 'subtitle' not in r.extra +    #assert 'subtitle' not in r.extra['jalc'] +    #assert 'funder' not in r.extra +    #assert 'funder' not in r.extra['jalc'] +    # matched by ISSN, so shouldn't be in there? +    #assert extra['container_name'] == "International Journal of Quantum Chemistry" +    assert len(r.contribs) == 4 + +    assert r.contribs[0].raw_name == "Hashimoto Yasuhiko" +    assert r.contribs[0].given_name == "Yasuhiko" +    assert r.contribs[0].surname == "Hashimoto" +    assert r.contribs[0].extra['original_name']['raw_name'] == "橋本 雍彦" +    assert r.contribs[0].extra['original_name']['given_name'] == "雍彦" +    assert r.contribs[0].extra['original_name']['surname'] == "橋本" + +    assert r.contribs[3].raw_name == "Takahashi Teruo" +    assert r.contribs[3].given_name == "Teruo" +    assert r.contribs[3].surname == "Takahashi" +    assert r.contribs[3].extra['original_name']['raw_name'] == "高橋 輝男" +    assert r.contribs[3].extra['original_name']['given_name'] == "輝男" +    assert r.contribs[3].extra['original_name']['surname'] == "高橋" + +    assert not r.refs | 
