diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 270 |
2 files changed, 271 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index ecbfe38e..497946ea 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -15,6 +15,7 @@ To run an import you combine two classes; one each of: from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .jalc import JalcImporter +from .jstor import JstorImporter from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py new file mode 100644 index 00000000..6ab320c3 --- /dev/null +++ b/python/fatcat_tools/importers/jstor.py @@ -0,0 +1,270 @@ + + +import sys +import json +import sqlite3 +import datetime +import itertools +import subprocess +from bs4 import BeautifulSoup + +import fatcat_client +from .common import EntityImporter, clean + +# is this just ISO 3-char to ISO 2-char? 
# XXX: more entries
JSTOR_LANG_MAP = {
    'eng': 'en',
}

# XXX: more entries
JSTOR_CONTRIB_MAP = {
    'author': 'author',
    'editor': 'editor',
    'translator': 'translator',
    'illustrator': 'illustrator',
}

class JstorImporter(EntityImporter):
    """
    Importer for JSTOR bulk XML metadata (eg, from their Early Journals
    Collection)
    """

    def __init__(self, api, issn_map_file, **kwargs):
        """
        Set up editgroup metadata, then load the ISSN map.

        Recognized kwargs: 'editgroup_description', 'editgroup_extra' and
        'create_containers'; anything else is forwarded to the parent class.
        """

        # pop() rather than get(): these two are passed explicitly to the
        # parent below, so leaving them in kwargs would raise "got multiple
        # values for keyword argument" whenever a caller supplies them
        eg_desc = kwargs.pop('editgroup_description',
            "Automated import of JSTOR XML metadata")
        eg_extra = kwargs.pop('editgroup_extra', dict())
        eg_extra.setdefault('agent', 'fatcat_tools.JstorImporter')
        super().__init__(api,
            issn_map_file=issn_map_file,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

        self.create_containers = kwargs.get('create_containers')

        self.read_issn_map_file(issn_map_file)

    @staticmethod
    def _tag_string(tag):
        """Return `tag.string`, or None when the tag itself was not found."""
        if tag is None:
            return None
        return tag.string

    def want(self, obj):
        """Every record is accepted; there is no filtering (yet)."""
        return True

    def parse_record(self, article):
        """
        Convert a single parsed <article> element into a ReleaseEntity.

        Missing optional elements (title, contributor name parts, pub-date
        month/day, custom-meta, ...) result in None fields instead of
        exceptions.
        """

        journal_meta = article.front.find("journal-meta")
        article_meta = article.front.find("article-meta")

        extra = dict()
        extra_jstor = dict()

        journal_title = self._tag_string(journal_meta.find("journal-title"))
        publisher = self._tag_string(journal_meta.find("publisher-name"))
        issn = self._tag_string(journal_meta.find("issn"))
        if issn:
            if len(issn) == 8:
                # normalize "12345678" to "1234-5678"
                issn = "{}-{}".format(issn[0:4], issn[4:8])
            else:
                assert len(issn) == 9
        # XXX:
        container_id = None
        # not used yet; placeholder for container lookup/creation
        container = dict(
            name=journal_title,
            publisher=publisher,
            issn=issn, # TODO: ISSN-L lookup...
        )

        doi = self._tag_string(
            article_meta.find("article-id", {"pub-id-type": "doi"}))
        if doi:
            doi = doi.lower().strip()

        jstor_id = self._tag_string(
            article_meta.find("article-id", {"pub-id-type": "jstor"}))

        title = self._tag_string(article_meta.find("article-title"))
        if title:
            title = title.strip()
            if title.endswith('.'):
                title = title[:-1]

        contribs = []
        cgroup = article_meta.find("contrib-group")
        if cgroup:
            for c in cgroup.find_all("contrib"):
                given = c.find("given-names")
                surname = c.find("surname")
                if given and surname:
                    name = "{} {}".format(given.string, surname.string)
                elif surname:
                    name = surname.string
                else:
                    name = None
                contribs.append(fatcat_client.ReleaseContrib(
                    # .get(): the contrib-type attribute is not guaranteed
                    role=JSTOR_CONTRIB_MAP.get(c.get('contrib-type')),
                    raw_name=clean(name),
                    # either name part may be absent from the record
                    given_name=clean(given.string) if given is not None else None,
                    surname=clean(surname.string) if surname is not None else None,
                ))

        release_year = None
        release_date = None
        pub_date = article_meta.find('pub-date')
        if pub_date and pub_date.year:
            release_year = int(pub_date.year.string)
            if pub_date.month and pub_date.day:
                release_date = datetime.date(
                    release_year,
                    int(pub_date.month.string),
                    int(pub_date.day.string))

        volume = None
        if article_meta.volume:
            volume = article_meta.volume.string or None

        issue = None
        if article_meta.issue:
            issue = article_meta.issue.string or None

        pages = None
        page_range = article_meta.find("page-range")
        if page_range is not None:
            pages = page_range.string
        elif article_meta.fpage:
            pages = article_meta.fpage.string

        language = None
        cm = article_meta.find("custom-meta")
        if cm is not None and self._tag_string(cm.find("meta-name")) == "lang":
            language = self._tag_string(cm.find("meta-value"))
            language = JSTOR_LANG_MAP.get(language)

        release_type = "article-journal"
        if title:
            # bracketed markers in the title hint at the kind of content
            if "[Abstract]" in title:
                # TODO: strip the "[Abstract]" bit?
                release_type = "abstract"
            elif "[Editorial" in title:
                release_type = "editorial"
            elif "[Letter" in title:
                release_type = "letter"
            elif "[Poem" in title or "[Photograph" in title:
                release_type = None

            if title.startswith("[") and title.endswith("]"):
                # strip brackets if that is all that is there (eg, translation or non-english)
                title = title[1:-1]

        # JSTOR issue-id
        issue_id_tag = article_meta.find('issue-id')
        if issue_id_tag is not None:
            issue_id = clean(issue_id_tag.string)
            if issue_id:
                extra_jstor['issue_id'] = issue_id

        # JSTOR journal-id
        # XXX:

        # everything in JSTOR is published
        release_stage = "published"

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_jstor:
            extra['jstor'] = extra_jstor
        if not extra:
            extra = None

        re = fatcat_client.ReleaseEntity(
            #work_id
            title=title,
            #original_title
            release_type=release_type,
            release_stage=release_stage,
            # a full date is only known when month and day were both present
            release_date=release_date.isoformat() if release_date else None,
            release_year=release_year,
            ext_ids=fatcat_client.ReleaseExtIds(
                doi=doi,
                jstor=jstor_id,
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            publisher=publisher,
            language=language,
            #license_slug

            # content, mimetype, lang
            #abstracts=abstracts,

            contribs=contribs,

            # key, year, container_name, title, locator
            # extra: volume, authors, issue, publisher, identifiers
            #refs=refs,

            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            container_id=container_id,

            extra=extra,
        )
        return re

    def try_update(self, re):
        """
        Decide whether `re` should be inserted as a new release.

        Returns True when no existing release matched. Returns False when a
        match already carries a JSTOR id (counted as 'exists') or when a
        DOI-only match was updated in place (counted as 'update').
        """

        # first, lookup existing by JSTOR id (which must be defined)
        existing = None
        try:
            existing = self.api.lookup_release(jstor=re.ext_ids.jstor)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        # then try DOI lookup if there is one
        if not existing and re.ext_ids.doi:
            try:
                existing = self.api.lookup_release(doi=re.ext_ids.doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise err

        if existing and existing.ext_ids.jstor:
            # don't update if it already has JSTOR ID
            self.counts['exists'] += 1
            return False
        elif existing:
            # but do update if only DOI was set
            existing.ext_ids.jstor = re.ext_ids.jstor
            # parse_record() sets extra to None when nothing was collected
            if re.extra and re.extra.get('jstor'):
                if existing.extra is None:
                    existing.extra = dict()
                existing.extra['jstor'] = re.extra['jstor']
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
            self.counts['update'] += 1
            return False

        return True

    def insert_batch(self, batch):
        """Create all releases in `batch` under one auto-accepted editgroup."""
        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
            editgroup=fatcat_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))

    def parse_file(self, handle):
        """Debug helper: parse every <article> in `handle` and print it."""

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_record on each
        for article in soup.find_all("article"):
            resp = self.parse_record(article)
            # NOTE(review): resp is a ReleaseEntity; json.dumps() probably
            # can't serialize it directly -- confirm and convert if needed
            print(json.dumps(resp))
            #sys.exit(-1)

if __name__=='__main__':
    # NOTE(review): the constructor requires `api` and `issn_map_file`, so
    # this debug entry point can't run as written -- TODO pass real values
    parser = JstorImporter()
    with open(sys.argv[1]) as handle:
        parser.parse_file(handle)