From 82ea0b02b29d6be0542eb43f00710a23ed8516c2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 15 May 2019 15:06:35 -0700
Subject: initial JSTOR importer

---
 python/fatcat_tools/importers/__init__.py |   1 +
 python/fatcat_tools/importers/jstor.py    | 270 ++++++++++++++++++++++++++++++
 python/parse_jstor_xml.py                 | 178 --------------------
 3 files changed, 271 insertions(+), 178 deletions(-)
 create mode 100644 python/fatcat_tools/importers/jstor.py
 delete mode 100644 python/parse_jstor_xml.py

(limited to 'python')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index ecbfe38e..497946ea 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,6 +15,7 @@ To run an import you combine two classes; one each of:
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
 from .jalc import JalcImporter
+from .jstor import JstorImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
new file mode 100644
index 00000000..6ab320c3
--- /dev/null
+++ b/python/fatcat_tools/importers/jstor.py
@@ -0,0 +1,270 @@
+
+
+import sys
+import json
+import sqlite3
+import datetime
+import itertools
+import subprocess
+from bs4 import BeautifulSoup
+
+import fatcat_client
+from .common import EntityImporter, clean
+
+# is this just ISO 3-char to ISO 2-char?
+# XXX: more entries
+JSTOR_LANG_MAP = {
+    'eng': 'en',
+}
+
+# XXX: more entries
+JSTOR_CONTRIB_MAP = {
+    'author': 'author',
+    'editor': 'editor',
+    'translator': 'translator',
+    'illustrator': 'illustrator',
+}
+
+class JstorImporter(EntityImporter):
+    """
+    Importer for JSTOR bulk XML metadata (eg, from their Early Journals
+    Collection)
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of JSTOR XML metadata")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        self.create_containers = kwargs.get('create_containers')
+
+        self.read_issn_map_file(issn_map_file)
+
+    def want(self, obj):
+        return True
+
+    def parse_record(self, article):
+
+        journal_meta = article.front.find("journal-meta")
+        article_meta = article.front.find("article-meta")
+
+        extra = dict()
+        extra_jstor = dict()
+
+        journal_title = journal_meta.find("journal-title").string
+        publisher = journal_meta.find("publisher-name").string
+        issn = journal_meta.find("issn")
+        if issn:
+            issn = issn.string
+            if len(issn) == 8:
+                issn = "{}-{}".format(issn[0:4], issn[4:8])
+            else:
+                assert len(issn) == 9
+        # XXX:
+        container_id = None
+        container = dict(
+            name=journal_title,
+            publisher=publisher,
+            issn=issn, # TODO: ISSN-L lookup...
+ ) + + doi = article_meta.find("article-id", {"pub-id-type": "doi"}) + if doi: + doi = doi.string.lower().strip() + + jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) + if jstor_id: + jstor_id = jstor_id.string + + title = article_meta.find("article-title") + if title: + title = title.string.strip() + if title.endswith('.'): + title = title[:-1] + + contribs = [] + cgroup = article_meta.find("contrib-group") + if cgroup: + for c in cgroup.find_all("contrib"): + given = c.find("given-names") + surname = c.find("surname") + if given and surname: + name = "{} {}".format(given.string, surname.string) + elif surname: + name = surname.string + else: + name = None + contribs.append(fatcat_client.ReleaseContrib( + role=JSTOR_CONTRIB_MAP.get(c['contrib-type']), + raw_name=clean(name), + given_name=clean(given.string), + surname=clean(surname.string), + )) + + release_year = None + release_date = None + pub_date = article_meta.find('pub-date') + if pub_date and pub_date.year: + release_year = int(pub_date.year.string) + if pub_date.month and pub_date.day: + release_date = datetime.date( + release_year, + int(pub_date.month.string), + int(pub_date.day.string)) + + volume = None + if article_meta.volume: + volume = article_meta.volume.string or None + + issue = None + if article_meta.issue: + issue = article_meta.issue.string or None + + pages = None + if article_meta.find("page-range"): + pages = article_meta.find("page-range").string + elif article_meta.fpage: + pages = article_meta.fpage.string + + language = None + cm = article_meta.find("custom-meta") + if cm.find("meta-name").string == "lang": + language = cm.find("meta-value").string + language = JSTOR_LANG_MAP.get(language) + + release_type = "article-journal" + if "[Abstract]" in title: + # TODO: strip the "[Abstract]" bit? 
+            release_type = "abstract"
+        elif "[Editorial" in title:
+            release_type = "editorial"
+        elif "[Letter" in title:
+            release_type = "letter"
+        elif "[Poem" in title or "[Photograph" in title:
+            release_type = None
+
+        if title.startswith("[") and title.endswith("]"):
+            # strip brackets if that is all that is there (eg, translation or non-english)
+            title = title[1:-1]
+
+        # JSTOR issue-id
+        if article_meta.find('issue-id'):
+            issue_id = clean(article_meta.find('issue-id').string)
+            if issue_id:
+                extra_jstor['issue_id'] = issue_id
+
+        # JSTOR journal-id
+        # XXX:
+
+        # everything in JSTOR is published
+        release_stage = "published"
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   pubmed: retraction refs
+        if extra_jstor:
+            extra['jstor'] = extra_jstor
+        if not extra:
+            extra = None
+
+        re = fatcat_client.ReleaseEntity(
+            #work_id
+            title=title,
+            #original_title
+            release_type=release_type,
+            release_stage=release_stage,
+            release_date=release_date.isoformat() if release_date else None,
+            release_year=release_year,
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                jstor=jstor_id,
+            ),
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            publisher=publisher,
+            language=language,
+            #license_slug
+
+            # content, mimetype, lang
+            #abstracts=abstracts,
+
+            contribs=contribs,
+
+            #   key, year, container_name, title, locator
+            #   extra: volume, authors, issue, publisher, identifiers
+            #refs=refs,
+
+            #   name, type, publisher, issnl
+            #   extra: issnp, issne, original_name, languages, country
+            container_id=container_id,
+
+            extra=extra,
+        )
+        return re
+
+    def try_update(self, re):
+
+        # first, lookup existing by JSTOR id (which must be defined)
+        existing = None
+        try:
+            existing = self.api.lookup_release(jstor=re.ext_ids.jstor)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        # then try DOI lookup if there is one
+        if not existing and re.ext_ids.doi:
+            try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+
+        if existing and existing.ext_ids.jstor:
+            # don't update if it already has JSTOR ID
+            self.counts['exists'] += 1
+            return False
+        elif existing:
+            # but do update if only DOI was set
+            existing.ext_ids.jstor = re.ext_ids.jstor
+            existing.extra['jstor'] = re.extra['jstor']
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_record on each
+        for article in soup.find_all("article"):
+            resp = self.parse_record(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+if __name__=='__main__':
+    parser = JstorImporter()
+    parser.parse_file(open(sys.argv[1]))
diff --git a/python/parse_jstor_xml.py b/python/parse_jstor_xml.py
deleted file mode 100644
index 04f2b18e..00000000
--- a/python/parse_jstor_xml.py
+++ /dev/null
@@ -1,178 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-
-class JstorXmlParser():
-    """
-    Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("article"):
-            resp = self.parse_article(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-    def parse_article(self, article):
-
-        journal_meta = article.front.find("journal-meta")
-        article_meta = article.front.find("article-meta")
-
-        extra = dict()
-        extra_jstor = dict()
-
-        journal_title = journal_meta.find("journal-title").string
-        publisher = journal_meta.find("publisher-name").string
-        issn = journal_meta.find("issn")
-        if issn:
-            issn = issn.string
-            if len(issn) == 8:
-                issn = "{}-{}".format(issn[0:4], issn[4:8])
-            else:
-                assert len(issn) == 9
-        container = dict(
-            name=journal_title,
-            publisher=publisher,
-            issn=issn, # TODO: ISSN-L lookup...
-        )
-
-        doi = article_meta.find("article-id", attr={"pub-id-type": "doi"})
-        if doi:
-            doi = doi.string.lower().strip()
-
-        title = article_meta.find("article-title")
-        if title:
-            title = title.string.strip()
-            if title.endswith('.'):
-                title = title[:-1]
-
-        contribs = []
-        cgroup = article_meta.find("contrib-group")
-        if cgroup:
-            for c in cgroup.find_all("contrib"):
-                given = c.find("given-names")
-                surname = c.find("surname")
-                if given and surname:
-                    name = "{} {}".format(given.string, surname.string)
-                elif surname:
-                    name = surname.string
-                else:
-                    name = None
-                contribs.append(dict(
-                    role=c['contrib-type'], # XXX: types? mapping?
-                    raw_name=name,
-                ))
-
-        release_year = None
-        release_date = None
-        pub_date = article_meta.find('pub-date')
-        if pub_date and pub_date.year:
-            release_year = int(pub_date.year.string)
-            if pub_date.month and pub_date.day:
-                release_date = datetime.date(
-                    release_year,
-                    int(pub_date.month.string),
-                    int(pub_date.day.string))
-
-        volume = None
-        if article_meta.volume:
-            volume = article_meta.volume.string or None
-
-        issue = None
-        if article_meta.issue:
-            issue = article_meta.issue.string or None
-
-        pages = None
-        if article_meta.find("page-range"):
-            pages = article_meta.find("page-range").string
-        elif article_meta.fpage:
-            pages = article_meta.fpage.string
-
-        language = None
-        cm = article_meta.find("custom-meta")
-        if cm.find("meta-name").string == "lang":
-            language = cm.find("meta-value").string
-
-        release_type = "article-journal"
-        if "[Abstract]" in title:
-            release_type = "abstract"
-        elif "[Editorial" in title:
-            release_type = "editorial"
-        elif "[Letter" in title:
-            release_type = "letter"
-        elif "[Poem" in title or "[Photograph" in title:
-            release_type = None
-
-        if title.startswith("[") and title.endswith("]"):
-            # strip brackets if that is all that is there (eg, translation or non-english)
-            title = title[1:-1]
-
-        # everything in JSTOR is published
-        release_status = "published"
-
-        if extra_jstor:
-            extra['jstor'] = extra_jstor
-        if not extra:
-            extra = None
-
-        re = dict(
-            issn=issn, # not an entity field
-            #work_id
-            title=title,
-            #original_title
-            release_type=release_type,
-            release_status=release_status,
-            release_date=release_date.isoformat(),
-            release_year=release_year,
-            doi=doi,
-            #pmid
-            #pmcid
-            #isbn13 # TODO: ?
-            volume=volume,
-            issue=issue,
-            pages=pages,
-            publisher=publisher,
-            language=language,
-            #license_slug # TODO: ?
-
-            # content, mimetype, lang
-            #abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
-            contribs=contribs,
-
-            # key, year, container_name, title, locator
-            # extra: volume, authors, issue, publisher, identifiers
-            #refs=refs,
-
-            # name, type, publisher, issnl
-            # extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
-            extra=extra,
-        )
-        return re
-
-if __name__=='__main__':
-    parser = JstorXmlParser()
-    parser.parse_file(open(sys.argv[1]))
--
cgit v1.2.3
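
Usage note (not part of the commit): the __main__ block copied over from the old script no longer matches the new JstorImporter(api, issn_map_file) constructor, and as the importers package docstring says, an import is run by combining an importer with a pusher. Below is a minimal driver sketch, illustrative only; it assumes the Bs4XmlFilePusher(importer, xml_file, record_tag) signature from common.py, the stock OpenAPI-generated fatcat_client configuration objects, and hypothetical API host and ISSN map paths.

# Illustrative sketch only: wire JstorImporter to a fatcat API client and push
# one JSTOR bulk XML file through it. Host URL, ISSN map path, and the
# Bs4XmlFilePusher(importer, xml_file, record_tag) signature are assumptions.
import sys

import fatcat_client
from fatcat_tools.importers import JstorImporter, Bs4XmlFilePusher


def main():
    # point the generated client at a (hypothetical) local fatcat API instance
    conf = fatcat_client.Configuration()
    conf.host = "http://localhost:9411/v0"
    api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))

    # ISSN to ISSN-L map file, as consumed by read_issn_map_file() (hypothetical path)
    with open("ISSN-to-ISSN-L.txt", "r") as issn_map_file:
        importer = JstorImporter(api, issn_map_file)

    # each <article> element in the bulk XML becomes one parse_record() call
    with open(sys.argv[1], "r") as xml_file:
        Bs4XmlFilePusher(importer, xml_file, "article").run()


if __name__ == "__main__":
    main()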