author    Bryan Newbold <bnewbold@robocracy.org>   2019-05-15 12:02:55 -0700
committer Bryan Newbold <bnewbold@robocracy.org>   2019-05-21 11:41:29 -0700
commit    82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a
tree      930331468462a74873aebb44b88c051e8b096c4f /python
parent    4cff530fa3a49e845a2c21bbc85d74a92a3e2b06
initial flesh out of JALC parser
Diffstat (limited to 'python')
-rw-r--r--   python/fatcat_tools/importers/__init__.py   |   3
-rw-r--r--   python/fatcat_tools/importers/common.py     |  36
-rw-r--r--   python/fatcat_tools/importers/jalc.py       | 310
-rw-r--r--   python/parse_jalc_xml.py                    | 209
-rw-r--r--   python/tests/import_jalc.py                 |  88
5 files changed, 436 insertions, 210 deletions
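
This commit replaces the stand-alone python/parse_jalc_xml.py script with a proper JalcImporter (an EntityImporter subclass) plus a generic Bs4XmlFilePusher. A minimal wiring sketch, assuming a default-configured fatcat_client API and local copies of an ISSN-L map and a JALC XML dump; the paths and client setup here are illustrative, not part of the commit:

import fatcat_client
from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher

api = fatcat_client.DefaultApi(fatcat_client.ApiClient())  # assumed client setup
with open('ISSN-to-ISSN-L.txt', 'r') as issn_file:
    importer = JalcImporter(api, issn_file, create_containers=True)
    with open('jalc_lod_sample.xml', 'r') as xml_file:
        # every <Description> element is handed to JalcImporter.parse_record()
        counts = Bs4XmlFilePusher(importer, xml_file, "Description").run()
        # counts is a Counter of insert/exists/skip outcomes
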
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index f5ff43e5..ecbfe38e 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,8 +12,9 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, KafkaJsonPusher, make_kafka_consumer, clean
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .jalc import JalcImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 282f775c..7fca38cf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,13 +7,16 @@ import ftfy
import sqlite3
import itertools
import subprocess
+import unicodedata
from collections import Counter
import pykafka
+from bs4 import BeautifulSoup
import fatcat_client
from fatcat_client.rest import ApiException
+DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
@@ -52,6 +55,23 @@ def test_clean():
assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+def is_cjk(s):
+    if not s:
+        return False
+    # any kanji/kana/hangul character counts, to match test_is_cjk() below
+    return any(
+        unicodedata.name(c).split()[0] in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+        for c in s if c.isalpha())
+
+def test_is_cjk():
+ assert is_cjk(None) == False
+ assert is_cjk('') == False
+ assert is_cjk('blah') == False
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
+ assert is_cjk('菊') == True
+ assert is_cjk('ひヒ') == True
+ assert is_cjk('english with ひヒ') == True
+ assert is_cjk('き゚ゅ') == True
+ assert is_cjk('水道') == True
+ assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+
DOMAIN_REL_MAP = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher):
return counts
+class Bs4XmlFilePusher(RecordPusher):
+
+ def __init__(self, importer, xml_file, record_tag, **kwargs):
+ self.importer = importer
+ self.xml_file = xml_file
+ self.record_tag = record_tag
+
+ def run(self):
+ soup = BeautifulSoup(self.xml_file, "xml")
+ for record in soup.find_all(self.record_tag):
+ self.importer.push_record(record)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
class KafkaJsonPusher(RecordPusher):
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
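
The is_cjk() helper added above is what the JALC importer (next file) uses to decide whether paired name and publication strings are Japanese. Per test_is_cjk() it treats kanji, hiragana, katakana, and hangul alike; a quick check mirroring the test values:

from fatcat_tools.importers import is_cjk

assert is_cjk('水道')               # kanji
assert is_cjk('ひヒ')               # hiragana and katakana
assert is_cjk('english with ひヒ')  # any CJK character in the string counts
assert not is_cjk('blah') and not is_cjk('') and not is_cjk(None)
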
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
new file mode 100644
index 00000000..d7b89727
--- /dev/null
+++ b/python/fatcat_tools/importers/jalc.py
@@ -0,0 +1,310 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import itertools
+import subprocess
+from bs4 import BeautifulSoup
+
+import fatcat_client
+from .common import EntityImporter, clean, is_cjk, DATE_FMT
+
+
+class JalcImporter(EntityImporter):
+ """
+ Importer for JALC DOI metadata.
+
+ NOTE: some JALC DOIs seem to get cross-registered with Crossref
+ """
+
+ def __init__(self, api, issn_map_file, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of JALC DOI metadata")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter')
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ self.create_containers = kwargs.get('create_containers')
+ extid_map_file = kwargs.get('extid_map_file')
+ self.extid_map_db = None
+ if extid_map_file:
+ db_uri = "file:{}?mode=ro".format(extid_map_file)
+ print("Using external ID map: {}".format(db_uri))
+ self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+ else:
+ print("Not using external ID map")
+
+ self.read_issn_map_file(issn_map_file)
+
+ def lookup_ext_ids(self, doi):
+ if self.extid_map_db is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+ [doi.lower()]).fetchone()
+ if row is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=row[0],
+ pmid=row[1],
+ pmcid=row[2],
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
+
+ def want(self, obj):
+ return True
+
+ def parse_record(self, record):
+ """
+ record is a beautiful soup object
+ returns a ReleaseEntity, or None
+
+ In JALC metadata, both English and Japanese records are given for most
+ fields.
+ """
+
+ extra = dict()
+ extra_jalc = dict()
+
+ titles = record.find_all("title")
+ title = titles[0].string.strip()
+ original_title = None
+ if title.endswith('.'):
+ title = title[:-1]
+ if len(titles) > 1:
+ original_title = titles[1].string.strip()
+ if original_title.endswith('.'):
+ original_title = original_title[:-1]
+
+ doi = None
+ if record.doi:
+ doi = record.doi.string.lower().strip()
+ assert doi.startswith('10.')
+
+ contribs = []
+ people = record.find_all("Person")
+ if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
+ # both english and japanese names are included for every author
+ for i in range(int(len(people)/2)):
+ eng = people[i*2]
+ jpn = people[i*2 + 1]
+ # there isn't always an english name though? TODO
+ name = eng
+ if not name.find('name'):
+ name = jpn
+ contrib = fatcat_client.ReleaseContrib(
+ raw_name=clean(name.find('name').string),
+ given_name=clean(name.find('givenName').string),
+ surname=clean(name.find('familyName').string),
+ role='author',
+ )
+ if eng.find('name') and jpn.find('name'):
+ contrib.extra = {
+ 'original_name': {
+ 'lang': 'ja',
+ 'raw_name': clean(jpn.find('name').string),
+ 'given_name': clean(jpn.find('givenName').string),
+ 'surname': clean(jpn.find('familyName').string),
+ }}
+ contribs.append(contrib)
+ elif people:
+ # TODO: test for this codepath?
+ for eng in people:
+ contrib = dict(
+ raw_name=clean(eng.find('name').string),
+ given_name=clean(eng.find('givenName').string),
+ surname=clean(eng.find('familyName').string),
+ role='author',
+ )
+ contribs.append(contrib)
+
+ release_year = None
+ release_date = None
+ date = record.date or None
+ if date:
+ date = date.string
+            if len(date) == 10:
+                release_date = datetime.datetime.strptime(date, DATE_FMT).date()
+                release_year = release_date.year
+                release_date = release_date.isoformat()
+            elif len(date) == 4:
+ release_year = int(date)
+
+ pages = None
+ if record.startingPage:
+ pages = record.startingPage.string
+ if record.endingPage:
+ pages = "{}-{}".format(pages, record.endingPage.string)
+ volume = None
+ if record.volume:
+ volume = record.volume.string
+ issue = None
+ if record.number:
+ # note: number/issue transform
+ issue = record.number.string
+
+ # container
+        issn = None
+        issnl = None
+ issn_list = record.find_all("issn")
+ if issn_list:
+ # if we wanted the other ISSNs, would also need to uniq the list.
+ # But we only need one to lookup ISSN-L/container
+ issn = issn_list[0].string
+ issnl = self.issn2issnl(issn)
+ container_id = None
+ if issnl:
+ container_id = self.lookup_issnl(issnl)
+
+ publisher = None
+ container_name = None
+ container_extra = dict()
+
+ if record.publicationName:
+ pubs = [p.string.strip() for p in record.find_all("publicationName")]
+ pubs = [p for p in pubs if p]
+ assert(pubs)
+ if len(pubs) > 1 and pubs[0] == pubs[1]:
+ pubs = [pubs[0]]
+ elif len(pubs) > 1 and is_cjk(pubs[0]):
+ # ordering is not reliable
+ pubs = [pubs[1], pubs[0]]
+ container_name = clean(pubs[0])
+ if len(pubs) > 1:
+ orig_container_name = pubs[1]
+ container_extra['original_name'] = clean(pubs[1])
+
+ if record.publisher:
+ pubs = [p.string.strip() for p in record.find_all("publisher")]
+ pubs = [p for p in pubs if p]
+ if len(pubs) > 1 and pubs[0] == pubs[1]:
+ pubs = [pubs[0]]
+ elif len(pubs) > 1 and is_cjk(pubs[0]):
+ # ordering is not reliable
+ pubs = [pubs[1], pubs[0]]
+ publisher = clean(pubs[0])
+ if len(pubs) > 1:
+ container_extra['publisher_alt_name'] = pubs[1]
+
+ if (container_id is None and self.create_containers and (issnl is not None)
+ and container_name):
+ # name, type, publisher, issnl
+ # extra: issnp, issne, original_name, languages, country
+ container_extra['country'] = 'jp'
+ container_extra['languages'] = ['ja']
+ ce = fatcat_client.ContainerEntity(
+ name=container_name,
+ container_type='journal',
+ publisher=publisher,
+ issnl=issnl,
+ extra=(container_extra or None))
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+
+ # the vast majority of works are in japanese
+ # TODO: any indication when *not* in japanese?
+ lang = "ja"
+
+ # reasonable default for this collection
+ release_type = "article-journal"
+
+ # external identifiers
+ extids = self.lookup_ext_ids(doi=doi)
+
+ # extra:
+ # translation_of
+ # aliases
+ # container_name
+ # group-title
+ # always put at least an empty dict here to indicate the DOI registrar
+ # (informally)
+ extra['jalc'] = extra_jalc
+
+ re = fatcat_client.ReleaseEntity(
+ work_id=None,
+ title=title,
+ original_title=original_title,
+ release_type="article-journal",
+ release_stage='published',
+ release_date=release_date,
+ release_year=release_year,
+ ext_ids=fatcat_client.ReleaseExtIds(
+ doi=doi,
+ pmid=extids['pmid'],
+ pmcid=extids['pmcid'],
+ wikidata_qid=extids['wikidata_qid'],
+ core=extids['core_id'],
+ arxiv=extids['arxiv_id'],
+ jstor=extids['jstor_id'],
+ ),
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ publisher=publisher,
+ language=lang,
+ #license_slug
+
+ # content, mimetype, lang
+ #abstracts=abstracts,
+
+ # raw_name, role, raw_affiliation, extra
+ contribs=contribs,
+
+
+ extra=extra,
+ )
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing DOI
+ existing = None
+ try:
+ existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, need to insert
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def parse_file(self, handle):
+ """
+ Helper for testing; can run this file stand-alone instead of using a pusher
+ """
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over Description records, call parse_record() on each
+ for record in soup.find_all("Description"):
+ resp = self.parse_record(record)
+ #print(json.dumps(resp))
+ print(resp)
+ #sys.exit(-1)
+
+
+if __name__=='__main__':
+    # rough stand-alone hook: assumes a default-configured fatcat API client
+    # and takes an ISSN-L map file as the second argument
+    api = fatcat_client.DefaultApi(fatcat_client.ApiClient())
+    parser = JalcImporter(api, open(sys.argv[2]))
+    parser.parse_file(open(sys.argv[1]))
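
For paired Person entries, parse_record() keeps the English form as the primary contrib and stores the Japanese form under contrib.extra['original_name']. A sketch of inspecting one parsed record, reusing the importer from the wiring sketch above; the printed values come from tests/files/jalc_lod_sample.xml as asserted in tests/import_jalc.py:

from bs4 import BeautifulSoup

with open('jalc_lod_sample.xml', 'r') as f:
    soup = BeautifulSoup(f, "xml")
release = importer.parse_record(soup.find_all("Description")[0])
contrib = release.contribs[0]
print(contrib.raw_name, contrib.given_name, contrib.surname)
# Hashimoto Yasuhiko / Yasuhiko / Hashimoto
print(contrib.extra['original_name'])
# {'lang': 'ja', 'raw_name': '橋本 雍彦', 'given_name': '雍彦', 'surname': '橋本'}
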
diff --git a/python/parse_jalc_xml.py b/python/parse_jalc_xml.py
deleted file mode 100644
index d7817df9..00000000
--- a/python/parse_jalc_xml.py
+++ /dev/null
@@ -1,209 +0,0 @@
-
-import sys
-import json
-import datetime
-import unicodedata
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-DATE_FMT = "%Y-%m-%d"
-
-def is_cjk(s):
- if not s:
- return False
- return unicodedata.name(s[0]).startswith("CJK")
-
-class JalcXmlParser():
- """
- Converts JALC DOI metadata (in XML/RDF format) to fatcat release entity
-
- NOTE: some JALC DOIs seem to get cross-registered with Crossref
- """
-
- def __init__(self):
- pass
-
- def parse_file(self, handle):
-
- # 1. open with beautiful soup
- soup = BeautifulSoup(handle, "xml")
-
- # 2. iterate over articles, call parse_article on each
- for record in soup.find_all("Description"):
- resp = self.parse_record(record)
- print(json.dumps(resp))
- #sys.exit(-1)
-
-
- def parse_record(self, record):
- """
- In JALC metadata, both English and Japanese records are given for most
- fields.
- """
-
- #extra = dict()
- #extra_jalc = dict()
-
- titles = record.find_all("title")
- title = titles[0].string.strip()
- original_title = None
- if title.endswith('.'):
- title = title[:-1]
- if len(titles) > 1:
- original_title = titles[1].string.strip()
- if original_title.endswith('.'):
- original_title = original_title[:-1]
-
- doi = None
- if record.doi:
- doi = record.doi.string.lower().strip()
- assert doi.startswith('10.')
-
- contribs = []
- people = record.find_all("Person")
- if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
- # both english and japanese names are included
- for i in range(int(len(people)/2)):
- # both english and japanese names are included for every author
- eng = people[i*2]
- jpn = people[i*2 + 1]
- raw_name = eng.find('name')
- orig_name = jpn.find('name')
- if not raw_name:
- raw_name = orig_name
- contrib = dict(
- raw_name=raw_name.string,
- role='author',
- )
- if raw_name and orig_name:
- contrib['extra'] = dict(original_name=orig_name.string)
- contribs.append(contrib)
- elif people:
- for eng in people:
- raw_name = eng.find('name')
- contrib = dict(
- raw_name=eng.find('name').string,
- role='author',
- )
- contribs.append(contrib)
-
- release_year = None
- release_date = None
- date = record.date or None
- if date:
- date = date.string
- if len(date) is 10:
- release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date()
- release_year = release_date.year
- release_date = release_date.isoformat()
- elif len(date) is 4:
- release_year = int(date)
-
- pages = None
- if record.startingPage:
- pages = record.startingPage.string
- if record.endingPage:
- pages = "{}-{}".format(pages, record.endingPage.string)
- volume = None
- if record.volume:
- volume = record.volume.string
- issue = None
- if record.number:
- # note: number/issue transform
- issue = record.number.string
-
- issn = None
- issn_list = record.find_all("issn")
- if issn_list:
- # if we wanted the other ISSNs, would also need to uniq the list.
- # But we only need one to lookup ISSN-L/container
- issn = issn_list[0].string
-
- container = dict()
- container_extra = dict()
- container_name = None
- if record.publicationName:
- pubs = [p.string.strip() for p in record.find_all("publicationName")]
- pubs = [p for p in pubs if p]
- assert(pubs)
- if len(pubs) > 1 and pubs[0] == pubs[1]:
- pubs = [pubs[0]]
- elif len(pubs) > 1 and is_cjk(pubs[0]):
- # ordering is not reliable
- pubs = [pubs[1], pubs[0]]
- container_name = pubs[0]
- container['name'] = container_name
- if len(pubs) > 1:
- orig_container_name = pubs[1]
- container_extra['original_name'] = pubs[1]
- publisher = None
- if record.publisher:
- pubs = [p.string.strip() for p in record.find_all("publisher")]
- pubs = [p for p in pubs if p]
- if len(pubs) > 1 and pubs[0] == pubs[1]:
- pubs = [pubs[0]]
- elif len(pubs) > 1 and is_cjk(pubs[0]):
- # ordering is not reliable
- pubs = [pubs[1], pubs[0]]
- publisher = pubs[0]
- container['publisher'] = publisher
- if len(pubs) > 1:
- container_extra['publisher_alt_name'] = pubs[1]
- if container_extra:
- container['extra'] = container_extra
- if not container:
- container = None
-
- # the vast majority of works are in japanese
- # TODO: any indication when *not* in japanese?
- lang = "ja"
-
- # reasonable default for this collection
- release_type = "article-journal"
-
- re = dict(
- work_id=None,
- title=title,
- original_title=original_title,
- release_type="article-journal",
- release_status='submitted', # XXX: source_type?
- release_date=release_date,
- release_year=release_year,
- #arxiv_id
- doi=doi,
- #pmid
- #pmcid
- #isbn13 # never in Article
- volume=volume,
- issue=issue,
- pages=pages,
- publisher=publisher,
- language=lang,
- #license_slug # not in MEDLINE
-
- # content, mimetype, lang
- #abstracts=abstracts,
-
- # raw_name, role, raw_affiliation, extra
- contribs=contribs,
-
- # name, type, publisher, issnl
- # extra: issnp, issne, original_name, languages, country
- container=container,
-
- # extra:
- # withdrawn_date
- # translation_of
- # subtitle
- # aliases
- # container_name
- # group-title
- # pubmed: retraction refs
- #extra=extra,
- )
- return re
-
-if __name__=='__main__':
- parser = JalcXmlParser()
- parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py
new file mode 100644
index 00000000..7b25f0fa
--- /dev/null
+++ b/python/tests/import_jalc.py
@@ -0,0 +1,88 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import JalcImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def jalc_importer(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
+
+@pytest.fixture(scope="function")
+def jalc_importer_existing(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield JalcImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
+
+def test_jalc_importer(jalc_importer):
+ last_index = jalc_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/jalc_lod_sample.xml', 'r') as f:
+ jalc_importer.bezerk_mode = True
+ counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run()
+ assert counts['insert'] == 2
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = jalc_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "jalc" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.JalcImporter" in eg.extra['agent']
+
+ last_index = jalc_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/jalc_lod_sample.xml', 'r') as f:
+ jalc_importer.bezerk_mode = False
+ jalc_importer.reset()
+ counts = Bs4XmlFilePusher(jalc_importer, f, "Description").run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 2
+ assert counts['skip'] == 0
+ assert last_index == jalc_importer.api.get_changelog(limit=1)[0].index
+
+def test_jalc_dict_parse(jalc_importer):
+ with open('tests/files/jalc_lod_sample.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r = jalc_importer.parse_record(soup.find_all("Description")[0])
+
+ print(r.extra)
+ assert r.title == "New carbides in the Ni-Ti-Mo-C system"
+ assert r.subtitle == None
+ assert r.original_title == "Ni-Ti-Mo-C系に出現する新炭化物相について"
+ assert r.publisher == "Japan Society of Powder and Powder Metallurgy"
+ assert r.release_type == "article-journal"
+ assert r.release_stage == "published"
+ assert r.license_slug == None
+ assert r.ext_ids.doi == "10.2497/jjspm.36.898"
+ assert r.language == "ja"
+ assert r.volume == "36"
+ assert r.issue == "8"
+ assert r.pages == "898-902"
+ assert r.release_year == 1989
+ # XXX:
+ #assert 'subtitle' not in r.extra
+ #assert 'subtitle' not in r.extra['jalc']
+ #assert 'funder' not in r.extra
+ #assert 'funder' not in r.extra['jalc']
+ # matched by ISSN, so shouldn't be in there?
+ #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+ assert len(r.contribs) == 4
+
+ assert r.contribs[0].raw_name == "Hashimoto Yasuhiko"
+ assert r.contribs[0].given_name == "Yasuhiko"
+ assert r.contribs[0].surname == "Hashimoto"
+ assert r.contribs[0].extra['original_name']['raw_name'] == "橋本 雍彦"
+ assert r.contribs[0].extra['original_name']['given_name'] == "雍彦"
+ assert r.contribs[0].extra['original_name']['surname'] == "橋本"
+
+ assert r.contribs[3].raw_name == "Takahashi Teruo"
+ assert r.contribs[3].given_name == "Teruo"
+ assert r.contribs[3].surname == "Takahashi"
+ assert r.contribs[3].extra['original_name']['raw_name'] == "高橋 輝男"
+ assert r.contribs[3].extra['original_name']['given_name'] == "輝男"
+ assert r.contribs[3].extra['original_name']['surname'] == "高橋"
+
+ assert not r.refs
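
The fixtures above expect tests/files/jalc_lod_sample.xml, tests/files/ISSN-to-ISSN-L.snip.txt, and tests/files/example_map.sqlite3 alongside the existing importer test data. Like the other importer tests they run under pytest from the python/ directory; the exact runner invocation is an assumption about the local setup:

pytest tests/import_jalc.py -v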