diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 12:02:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch) | |
tree | 930331468462a74873aebb44b88c051e8b096c4f | |
parent | 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff) | |
download | fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip |
initial flesh out of JALC parser
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 36 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 310 | ||||
-rw-r--r-- | python/parse_jalc_xml.py | 209 | ||||
-rw-r--r-- | python/tests/import_jalc.py | 88 |
5 files changed, 436 insertions, 210 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index f5ff43e5..ecbfe38e 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -12,8 +12,9 @@ To run an import you combine two classes; one each of: """ -from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, KafkaJsonPusher, make_kafka_consumer, clean +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP +from .jalc import JalcImporter from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 282f775c..7fca38cf 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,13 +7,16 @@ import ftfy import sqlite3 import itertools import subprocess +import unicodedata from collections import Counter import pykafka +from bs4 import BeautifulSoup import fatcat_client from fatcat_client.rest import ApiException +DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 SANE_MAX_URLS = 100 @@ -52,6 +55,23 @@ def test_clean(): assert clean('<b>a&b</b>') == '<b>a&b</b>' assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' +def is_cjk(s): + if not s: + return False + return unicodedata.name(s[0]).startswith("CJK") + +def test_is_cjk(): + assert is_cjk(None) == False + assert is_cjk('') == False + assert is_cjk('blah') == False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True + assert is_cjk('菊') == True + assert is_cjk('ひヒ') == True + assert is_cjk('english with ひヒ') == True + assert is_cjk('き゚ゅ') == True + assert is_cjk('水道') == True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + DOMAIN_REL_MAP = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher): return counts +class Bs4XmlFilePusher(RecordPusher): + + def __init__(self, importer, xml_file, record_tag, **kwargs): + self.importer = importer + self.xml_file = xml_file + self.record_tag = record_tag + + def run(self): + soup = BeautifulSoup(self.xml_file, "xml") + for record in soup.find_all(self.record_tag): + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + class KafkaJsonPusher(RecordPusher): def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py new file mode 100644 index 00000000..d7b89727 --- /dev/null +++ b/python/fatcat_tools/importers/jalc.py @@ -0,0 +1,310 @@ + +import sys +import json +import sqlite3 +import datetime +import itertools +import subprocess +from bs4 import BeautifulSoup + +import fatcat_client +from .common import EntityImporter, clean, is_cjk, DATE_FMT + + +class JalcImporter(EntityImporter): + """ + Importer for JALC DOI metadata. + + NOTE: some JALC DOIs seem to get cross-registered with Crossref + """ + + def __init__(self, api, issn_map_file, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of JALC DOI metadata") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers') + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri)) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map") + + self.read_issn_map_file(issn_map_file) + + def lookup_ext_ids(self, doi): + if self.extid_map_db is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) + + def want(self, obj): + return True + + def parse_record(self, record): + """ + record is a beautiful soup object + returns a ReleaseEntity, or None + + In JALC metadata, both English and Japanese records are given for most + fields. + """ + + extra = dict() + extra_jalc = dict() + + titles = record.find_all("title") + title = titles[0].string.strip() + original_title = None + if title.endswith('.'): + title = title[:-1] + if len(titles) > 1: + original_title = titles[1].string.strip() + if original_title.endswith('.'): + original_title = original_title[:-1] + + doi = None + if record.doi: + doi = record.doi.string.lower().strip() + assert doi.startswith('10.') + + contribs = [] + people = record.find_all("Person") + if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): + # both english and japanese names are included for every author + for i in range(int(len(people)/2)): + eng = people[i*2] + jpn = people[i*2 + 1] + # there isn't always an english name though? TODO + name = eng + if not name.find('name'): + name = jpn + contrib = fatcat_client.ReleaseContrib( + raw_name=clean(name.find('name').string), + given_name=clean(name.find('givenName').string), + surname=clean(name.find('familyName').string), + role='author', + ) + if eng.find('name') and jpn.find('name'): + contrib.extra = { + 'original_name': { + 'lang': 'ja', + 'raw_name': clean(jpn.find('name').string), + 'given_name': clean(jpn.find('givenName').string), + 'surname': clean(jpn.find('familyName').string), + }} + contribs.append(contrib) + elif people: + # TODO: test for this codepath? + for eng in people: + contrib = dict( + raw_name=clean(eng.find('name').string), + given_name=clean(eng.find('givenName').string), + surname=clean(eng.find('familyName').string), + role='author', + ) + contribs.append(contrib) + + release_year = None + release_date = None + date = record.date or None + if date: + date = date.string + if len(date) is 10: + release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() + release_year = release_date.year + release_date = release_date.isoformat() + elif len(date) is 4: + release_year = int(date) + + pages = None + if record.startingPage: + pages = record.startingPage.string + if record.endingPage: + pages = "{}-{}".format(pages, record.endingPage.string) + volume = None + if record.volume: + volume = record.volume.string + issue = None + if record.number: + # note: number/issue transform + issue = record.number.string + + # container + issn = None + issn_list = record.find_all("issn") + if issn_list: + # if we wanted the other ISSNs, would also need to uniq the list. + # But we only need one to lookup ISSN-L/container + issn = issn_list[0].string + issnl = self.issn2issnl(issn) + container_id = None + if issnl: + container_id = self.lookup_issnl(issnl) + + publisher = None + container_name = None + container_extra = dict() + + if record.publicationName: + pubs = [p.string.strip() for p in record.find_all("publicationName")] + pubs = [p for p in pubs if p] + assert(pubs) + if len(pubs) > 1 and pubs[0] == pubs[1]: + pubs = [pubs[0]] + elif len(pubs) > 1 and is_cjk(pubs[0]): + # ordering is not reliable + pubs = [pubs[1], pubs[0]] + container_name = clean(pubs[0]) + if len(pubs) > 1: + orig_container_name = pubs[1] + container_extra['original_name'] = clean(pubs[1]) + + if record.publisher: + pubs = [p.string.strip() for p in record.find_all("publisher")] + pubs = [p for p in pubs if p] + if len(pubs) > 1 and pubs[0] == pubs[1]: + pubs = [pubs[0]] + elif len(pubs) > 1 and is_cjk(pubs[0]): + # ordering is not reliable + pubs = [pubs[1], pubs[0]] + publisher = clean(pubs[0]) + if len(pubs) > 1: + container_extra['publisher_alt_name'] = pubs[1] + + if (container_id is None and self.create_containers and (issnl is not None) + and container_name): + # name, type, publisher, issnl + # extra: issnp, issne, original_name, languages, country + container_extra['country'] = 'jp' + container_extra['languages'] = ['ja'] + ce = fatcat_client.ContainerEntity( + name=container_name, + container_type='journal', + publisher=publisher, + issnl=issnl, + extra=(container_extra or None)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # the vast majority of works are in japanese + # TODO: any indication when *not* in japanese? + lang = "ja" + + # reasonable default for this collection + release_type = "article-journal" + + # external identifiers + extids = self.lookup_ext_ids(doi=doi) + + # extra: + # translation_of + # aliases + # container_name + # group-title + # always put at least an empty dict here to indicate the DOI registrar + # (informally) + extra['jalc'] = extra_jalc + + re = fatcat_client.ReleaseEntity( + work_id=None, + title=title, + original_title=original_title, + release_type="article-journal", + release_stage='published', + release_date=release_date, + release_year=release_year, + ext_ids=fatcat_client.ReleaseExtIds( + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], + ), + volume=volume, + issue=issue, + pages=pages, + publisher=publisher, + language=lang, + #license_slug + + # content, mimetype, lang + #abstracts=abstracts, + + # raw_name, role, raw_affiliation, extra + contribs=contribs, + + + extra=extra, + ) + return re + + def try_update(self, re): + + # lookup existing DOI + existing = None + try: + existing = self.api.lookup_release(doi=re.ext_ids.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to insert + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + return False + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch( + editgroup=fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_file(self, handle): + """ + Helper for testing; can run this file stand-alone instead of using a pusher + """ + + # 1. open with beautiful soup + soup = BeautifulSoup(handle, "xml") + + # 2. iterate over articles, call parse_article on each + for record in soup.find_all("Description"): + resp = self.parse_record(record) + #print(json.dumps(resp)) + print(resp) + #sys.exit(-1) + + +if __name__=='__main__': + parser = JalcXmlParser() + parser.parse_file(open(sys.argv[1])) diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py deleted file mode 100644 index d7817df9..00000000 --- a/python/parse_jalc_xml.py +++ /dev/null @@ -1,209 +0,0 @@ - -import sys -import json -import datetime -import unicodedata -from bs4 import BeautifulSoup -from bs4.element import NavigableString - - -DATE_FMT = "%Y-%m-%d" - -def is_cjk(s): - if not s: - return False - return unicodedata.name(s[0]).startswith("CJK") - -class JalcXmlParser(): - """ - Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity - - NOTE: some JALC DOIs seem to get cross-registered with Crossref - """ - - def __init__(self): - pass - - def parse_file(self, handle): - - # 1. open with beautiful soup - soup = BeautifulSoup(handle, "xml") - - # 2. iterate over articles, call parse_article on each - for record in soup.find_all("Description"): - resp = self.parse_record(record) - print(json.dumps(resp)) - #sys.exit(-1) - - - def parse_record(self, record): - """ - In JALC metadata, both English and Japanese records are given for most - fields. - """ - - #extra = dict() - #extra_jalc = dict() - - titles = record.find_all("title") - title = titles[0].string.strip() - original_title = None - if title.endswith('.'): - title = title[:-1] - if len(titles) > 1: - original_title = titles[1].string.strip() - if original_title.endswith('.'): - original_title = original_title[:-1] - - doi = None - if record.doi: - doi = record.doi.string.lower().strip() - assert doi.startswith('10.') - - contribs = [] - people = record.find_all("Person") - if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): - # both english and japanese names are included - for i in range(int(len(people)/2)): - # both english and japanese names are included for every author - eng = people[i*2] - jpn = people[i*2 + 1] - raw_name = eng.find('name') - orig_name = jpn.find('name') - if not raw_name: - raw_name = orig_name - contrib = dict( - raw_name=raw_name.string, - role='author', - ) - if raw_name and orig_name: - contrib['extra'] = dict(original_name=orig_name.string) - contribs.append(contrib) - elif people: - for eng in people: - raw_name = eng.find('name') - contrib = dict( - raw_name=eng.find('name').string, - role='author', - ) - contribs.append(contrib) - - release_year = None - release_date = None - date = record.date or None - if date: - date = date.string - if len(date) is 10: - release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() - release_year = release_date.year - release_date = release_date.isoformat() - elif len(date) is 4: - release_year = int(date) - - pages = None - if record.startingPage: - pages = record.startingPage.string - if record.endingPage: - pages = "{}-{}".format(pages, record.endingPage.string) - volume = None - if record.volume: - volume = record.volume.string - issue = None - if record.number: - # note: number/issue transform - issue = record.number.string - - issn = None - issn_list = record.find_all("issn") - if issn_list: - # if we wanted the other ISSNs, would also need to uniq the list. - # But we only need one to lookup ISSN-L/container - issn = issn_list[0].string - - container = dict() - container_extra = dict() - container_name = None - if record.publicationName: - pubs = [p.string.strip() for p in record.find_all("publicationName")] - pubs = [p for p in pubs if p] - assert(pubs) - if len(pubs) > 1 and pubs[0] == pubs[1]: - pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): - # ordering is not reliable - pubs = [pubs[1], pubs[0]] - container_name = pubs[0] - container['name'] = container_name - if len(pubs) > 1: - orig_container_name = pubs[1] - container_extra['original_name'] = pubs[1] - publisher = None - if record.publisher: - pubs = [p.string.strip() for p in record.find_all("publisher")] - pubs = [p for p in pubs if p] - if len(pubs) > 1 and pubs[0] == pubs[1]: - pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): - # ordering is not reliable - pubs = [pubs[1], pubs[0]] - publisher = pubs[0] - container['publisher'] = publisher - if len(pubs) > 1: - container_extra['publisher_alt_name'] = pubs[1] - if container_extra: - container['extra'] = container_extra - if not container: - container = None - - # the vast majority of works are in japanese - # TODO: any indication when *not* in japanese? - lang = "ja" - - # reasonable default for this collection - release_type = "article-journal" - - re = dict( - work_id=None, - title=title, - original_title=original_title, - release_type="article-journal", - release_status='submitted', # XXX: source_type? - release_date=release_date, - release_year=release_year, - #arxiv_id - doi=doi, - #pmid - #pmcid - #isbn13 # never in Article - volume=volume, - issue=issue, - pages=pages, - publisher=publisher, - language=lang, - #license_slug # not in MEDLINE - - # content, mimetype, lang - #abstracts=abstracts, - - # raw_name, role, raw_affiliation, extra - contribs=contribs, - - # name, type, publisher, issnl - # extra: issnp, issne, original_name, languages, country - container=container, - - # extra: - # withdrawn_date - # translation_of - # subtitle - # aliases - # container_name - # group-title - # pubmed: retraction refs - #extra=extra, - ) - return re - -if __name__=='__main__': - parser = JalcXmlParser() - parser.parse_file(open(sys.argv[1])) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py new file mode 100644 index 00000000..7b25f0fa --- /dev/null +++ b/python/tests/import_jalc.py @@ -0,0 +1,88 @@ + +import json, gzip +import pytest +from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher +from fixtures import api +from bs4 import BeautifulSoup + + +@pytest.fixture(scope="function") +def jalc_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) + +@pytest.fixture(scope="function") +def jalc_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) + +def test_jalc_importer(jalc_importer): + last_index = jalc_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/jalc_lod_sample.xml', 'r') as f: + jalc_importer.bezerk_mode = True + counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = jalc_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "jalc" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.JalcImporter" in eg.extra['agent'] + + last_index = jalc_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/jalc_lod_sample.xml', 'r') as f: + jalc_importer.bezerk_mode = False + jalc_importer.reset() + counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 0 + assert last_index == jalc_importer.api.get_changelog(limit=1)[0].index + +def test_jalc_dict_parse(jalc_importer): + with open('tests/files/jalc_lod_sample.xml', 'r') as f: + soup = BeautifulSoup(f, "xml") + r = jalc_importer.parse_record(soup.find_all("Description")[0]) + + print(r.extra) + assert r.title == "New carbides in the Ni-Ti-Mo-C system" + assert r.subtitle == None + assert r.original_title == "Ni-Ti-Mo-C系に出現する新炭化物相について" + assert r.publisher == "Japan Society of Powder and Powder Metallurgy" + assert r.release_type == "article-journal" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.ext_ids.doi == "10.2497/jjspm.36.898" + assert r.language == "ja" + assert r.volume == "36" + assert r.issue == "8" + assert r.pages == "898-902" + assert r.release_year == 1989 + # XXX: + #assert 'subtitle' not in r.extra + #assert 'subtitle' not in r.extra['jalc'] + #assert 'funder' not in r.extra + #assert 'funder' not in r.extra['jalc'] + # matched by ISSN, so shouldn't be in there? + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert len(r.contribs) == 4 + + assert r.contribs[0].raw_name == "Hashimoto Yasuhiko" + assert r.contribs[0].given_name == "Yasuhiko" + assert r.contribs[0].surname == "Hashimoto" + assert r.contribs[0].extra['original_name']['raw_name'] == "橋本 雍彦" + assert r.contribs[0].extra['original_name']['given_name'] == "雍彦" + assert r.contribs[0].extra['original_name']['surname'] == "橋本" + + assert r.contribs[3].raw_name == "Takahashi Teruo" + assert r.contribs[3].given_name == "Teruo" + assert r.contribs[3].surname == "Takahashi" + assert r.contribs[3].extra['original_name']['raw_name'] == "高橋 輝男" + assert r.contribs[3].extra['original_name']['given_name'] == "輝男" + assert r.contribs[3].extra['original_name']['surname'] == "高橋" + + assert not r.refs |