diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 15:06:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 82ea0b02b29d6be0542eb43f00710a23ed8516c2 (patch) | |
tree | 5be411a5c9a4880c36e2b7e764e1954e84076abb | |
parent | 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (diff) | |
download | fatcat-82ea0b02b29d6be0542eb43f00710a23ed8516c2.tar.gz fatcat-82ea0b02b29d6be0542eb43f00710a23ed8516c2.zip |
initial JSTOR importer
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jstor.py (renamed from python/parse_jstor_xml.py) | 170 |
2 files changed, 132 insertions, 39 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index ecbfe38e..497946ea 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -15,6 +15,7 @@ To run an import you combine two classes; one each of: from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .jalc import JalcImporter +from .jstor import JstorImporter from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter diff --git a/python/parse_jstor_xml.py b/python/fatcat_tools/importers/jstor.py index 04f2b18e..6ab320c3 100644 --- a/python/parse_jstor_xml.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,31 +1,56 @@ + import sys import json +import sqlite3 import datetime +import itertools +import subprocess from bs4 import BeautifulSoup -from bs4.element import NavigableString +import fatcat_client +from .common import EntityImporter, clean + +# is this just ISO 3-char to ISO 2-char? +# XXX: more entries +JSTOR_LANG_MAP = { + 'eng': 'en', +} -class JstorXmlParser(): +# XXX: more entries +JSTOR_CONTRIB_MAP = { + 'author': 'author', + 'editor': 'editor', + 'translator': 'translator', + 'illustrator': 'illustrator', +} + +class JstorImporter(EntityImporter): """ - Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection) + Importer for JSTOR bulk XML metadata (eg, from their Early Journals + Collection) """ - def __init__(self): - pass + def __init__(self, api, issn_map_file, **kwargs): - def parse_file(self, handle): + eg_desc = kwargs.get('editgroup_description', + "Automated import of JSTOR XML metadata") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) - # 1. open with beautiful soup - soup = BeautifulSoup(handle, "xml") + self.create_containers = kwargs.get('create_containers') - # 2. iterate over articles, call parse_article on each - for article in soup.find_all("article"): - resp = self.parse_article(article) - print(json.dumps(resp)) - #sys.exit(-1) + self.read_issn_map_file(issn_map_file) - def parse_article(self, article): + def want(self, obj): + return True + + def parse_record(self, article): journal_meta = article.front.find("journal-meta") article_meta = article.front.find("article-meta") @@ -42,16 +67,22 @@ class JstorXmlParser(): issn = "{}-{}".format(issn[0:4], issn[4:8]) else: assert len(issn) == 9 + # XXX: + container_id = None container = dict( name=journal_title, publisher=publisher, issn=issn, # TODO: ISSN-L lookup... ) - doi = article_meta.find("article-id", attr={"pub-id-type": "doi"}) + doi = article_meta.find("article-id", {"pub-id-type": "doi"}) if doi: doi = doi.string.lower().strip() + jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) + if jstor_id: + jstor_id = jstor_id.string + title = article_meta.find("article-title") if title: title = title.string.strip() @@ -70,9 +101,11 @@ class JstorXmlParser(): name = surname.string else: name = None - contribs.append(dict( - role=c['contrib-type'], # XXX: types? mapping? - raw_name=name, + contribs.append(fatcat_client.ReleaseContrib( + role=JSTOR_CONTRIB_MAP.get(c['contrib-type']), + raw_name=clean(name), + given_name=clean(given.string), + surname=clean(surname.string), )) release_year = None @@ -104,9 +137,11 @@ class JstorXmlParser(): cm = article_meta.find("custom-meta") if cm.find("meta-name").string == "lang": language = cm.find("meta-value").string + language = JSTOR_LANG_MAP.get(language) release_type = "article-journal" if "[Abstract]" in title: + # TODO: strip the "[Abstract]" bit? release_type = "abstract" elif "[Editorial" in title: release_type = "editorial" @@ -119,38 +154,53 @@ class JstorXmlParser(): # strip brackets if that is all that is there (eg, translation or non-english) title = title[1:-1] + # JSTOR issue-id + if article_meta.find('issue-id'): + issue_id = clean(article_meta.find('issue-id').string) + if issue_id: + extra_jstor['issue_id'] = issue_id + + # JSTOR journal-id + # XXX: + # everything in JSTOR is published - release_status = "published" + release_stage = "published" + # extra: + # withdrawn_date + # translation_of + # subtitle + # aliases + # container_name + # group-title + # pubmed: retraction refs if extra_jstor: extra['jstor'] = extra_jstor if not extra: extra = None - re = dict( - issn=issn, # not an entity field + re = fatcat_client.ReleaseEntity( #work_id title=title, #original_title release_type=release_type, - release_status=release_status, + release_stage=release_stage, release_date=release_date.isoformat(), release_year=release_year, - doi=doi, - #pmid - #pmcid - #isbn13 # TODO: ? + ext_ids=fatcat_client.ReleaseExtIds( + doi=doi, + jstor=jstor_id, + ), volume=volume, issue=issue, pages=pages, publisher=publisher, language=language, - #license_slug # TODO: ? + #license_slug # content, mimetype, lang #abstracts=abstracts, - # raw_name, role, raw_affiliation, extra contribs=contribs, # key, year, container_name, title, locator @@ -159,20 +209,62 @@ class JstorXmlParser(): # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country - container=container, - - # extra: - # withdrawn_date - # translation_of - # subtitle - # aliases - # container_name - # group-title - # pubmed: retraction refs + container_id=container_id, + extra=extra, ) return re + def try_update(self, re): + + # first, lookup existing by JSTOR id (which much be defined) + existing = None + try: + existing = self.api.lookup_release(jstor=re.ext_ids.jstor) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # then try DOI lookup if there is one + if not existing and re.ext_ids.doi: + try: + existing = self.api.lookup_release(jstor=re.ext_ids.jstor) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if existing and existing.ext_ids.jstor: + # don't update if it already has JSTOR ID + self.counts['exists'] += 1 + return False + elif existing: + # but do update if only DOI was set + existing.ext_ids.jstor = re.jstor_id + existing.extra['jstor'] = re.extra['jstor'] + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch( + editgroup=fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_file(self, handle): + + # 1. open with beautiful soup + soup = BeautifulSoup(handle, "xml") + + # 2. iterate over articles, call parse_article on each + for article in soup.find_all("article"): + resp = self.parse_record(article) + print(json.dumps(resp)) + #sys.exit(-1) + if __name__=='__main__': - parser = JstorXmlParser() + parser = JstorImporter() parser.parse_file(open(sys.argv[1])) |