author    Bryan Newbold <bnewbold@robocracy.org>  2019-05-15 15:06:35 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2019-05-21 11:41:29 -0700
commit    82ea0b02b29d6be0542eb43f00710a23ed8516c2 (patch)
tree      5be411a5c9a4880c36e2b7e764e1954e84076abb /python/fatcat_tools/importers
parent    82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (diff)
initial JSTOR importer
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/__init__.py    1
-rw-r--r--  python/fatcat_tools/importers/jstor.py      270
2 files changed, 271 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index ecbfe38e..497946ea 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,6 +15,7 @@ To run an import you combine two classes; one each of:
from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .jalc import JalcImporter
+from .jstor import JstorImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
new file mode 100644
index 00000000..6ab320c3
--- /dev/null
+++ b/python/fatcat_tools/importers/jstor.py
@@ -0,0 +1,270 @@
+
+
+import sys
+import datetime
+from bs4 import BeautifulSoup
+
+import fatcat_client
+from .common import EntityImporter, clean
+
+# presumably just ISO 639-2 (3-char) codes to ISO 639-1 (2-char) codes
+# XXX: more entries
+JSTOR_LANG_MAP = {
+ 'eng': 'en',
+}
+
+# XXX: more entries
+JSTOR_CONTRIB_MAP = {
+ 'author': 'author',
+ 'editor': 'editor',
+ 'translator': 'translator',
+ 'illustrator': 'illustrator',
+}
+
+class JstorImporter(EntityImporter):
+ """
+ Importer for JSTOR bulk XML metadata (eg, from their Early Journals
+ Collection)
+ """
+
+ def __init__(self, api, issn_map_file, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of JSTOR XML metadata")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ self.create_containers = kwargs.get('create_containers')
+
+ self.read_issn_map_file(issn_map_file)
+
+ def want(self, obj):
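+        # no filtering at this stage; accept every record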
+ return True
+
+ def parse_record(self, article):
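+        # `article` is a BeautifulSoup tag for a single <article> element of
+        # the (JATS-style) source XML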
+
+ journal_meta = article.front.find("journal-meta")
+ article_meta = article.front.find("article-meta")
+
+ extra = dict()
+ extra_jstor = dict()
+
+ journal_title = journal_meta.find("journal-title").string
+ publisher = journal_meta.find("publisher-name").string
+ issn = journal_meta.find("issn")
+ if issn:
+ issn = issn.string
+ if len(issn) == 8:
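+                # insert the hyphen: eg, "00029327" -> "0002-9327"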
+ issn = "{}-{}".format(issn[0:4], issn[4:8])
+ else:
+ assert len(issn) == 9
+            # XXX: container lookup not implemented; the container dict below
+            # is constructed but not yet used
+        container_id = None
+ container = dict(
+ name=journal_title,
+ publisher=publisher,
+ issn=issn, # TODO: ISSN-L lookup...
+ )
+
+ doi = article_meta.find("article-id", {"pub-id-type": "doi"})
+ if doi:
+ doi = doi.string.lower().strip()
+
+ jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
+ if jstor_id:
+ jstor_id = jstor_id.string
+
+        title = article_meta.find("article-title")
+        if title:
+            # .string is None when there is nested markup; fall back to get_text()
+            title = (title.string or title.get_text()).strip()
+            if title.endswith('.'):
+                title = title[:-1]
+
+ contribs = []
+ cgroup = article_meta.find("contrib-group")
+ if cgroup:
+ for c in cgroup.find_all("contrib"):
+ given = c.find("given-names")
+ surname = c.find("surname")
+ if given and surname:
+ name = "{} {}".format(given.string, surname.string)
+ elif surname:
+ name = surname.string
+ else:
+ name = None
+                contribs.append(fatcat_client.ReleaseContrib(
+                    role=JSTOR_CONTRIB_MAP.get(c.get('contrib-type')),
+                    raw_name=clean(name),
+                    given_name=clean(given.string) if given else None,
+                    surname=clean(surname.string) if surname else None,
+                ))
+
+ release_year = None
+ release_date = None
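+        # only use a full release_date when month and day are both present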
+ pub_date = article_meta.find('pub-date')
+ if pub_date and pub_date.year:
+ release_year = int(pub_date.year.string)
+ if pub_date.month and pub_date.day:
+ release_date = datetime.date(
+ release_year,
+ int(pub_date.month.string),
+ int(pub_date.day.string))
+
+ volume = None
+ if article_meta.volume:
+ volume = article_meta.volume.string or None
+
+ issue = None
+ if article_meta.issue:
+ issue = article_meta.issue.string or None
+
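+        # prefer the explicit page-range element; fall back to fpage (first page)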
+ pages = None
+ if article_meta.find("page-range"):
+ pages = article_meta.find("page-range").string
+ elif article_meta.fpage:
+ pages = article_meta.fpage.string
+
+ language = None
+        cm = article_meta.find("custom-meta")
+        if cm and cm.find("meta-name") and cm.find("meta-name").string == "lang":
+ language = cm.find("meta-value").string
+ language = JSTOR_LANG_MAP.get(language)
+
+        release_type = "article-journal"
+        if title and "[Abstract]" in title:
+            # TODO: strip the "[Abstract]" bit?
+            release_type = "abstract"
+        elif title and "[Editorial" in title:
+            release_type = "editorial"
+        elif title and "[Letter" in title:
+            release_type = "letter"
+        elif title and ("[Poem" in title or "[Photograph" in title):
+            release_type = None
+
+        if title and title.startswith("[") and title.endswith("]"):
+            # strip brackets if that is all that is there (eg, translation or non-english)
+            title = title[1:-1]
+
+ # JSTOR issue-id
+ if article_meta.find('issue-id'):
+ issue_id = clean(article_meta.find('issue-id').string)
+ if issue_id:
+ extra_jstor['issue_id'] = issue_id
+
+ # JSTOR journal-id
+ # XXX:
+
+ # everything in JSTOR is published
+ release_stage = "published"
+
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # pubmed: retraction refs
+ if extra_jstor:
+ extra['jstor'] = extra_jstor
+ if not extra:
+ extra = None
+
+ re = fatcat_client.ReleaseEntity(
+ #work_id
+ title=title,
+ #original_title
+ release_type=release_type,
+ release_stage=release_stage,
+            release_date=release_date.isoformat() if release_date else None,
+ release_year=release_year,
+ ext_ids=fatcat_client.ReleaseExtIds(
+ doi=doi,
+ jstor=jstor_id,
+ ),
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ publisher=publisher,
+ language=language,
+ #license_slug
+
+ # content, mimetype, lang
+ #abstracts=abstracts,
+
+ contribs=contribs,
+
+ # key, year, container_name, title, locator
+ # extra: volume, authors, issue, publisher, identifiers
+ #refs=refs,
+
+ # name, type, publisher, issnl
+ # extra: issnp, issne, original_name, languages, country
+ container_id=container_id,
+
+ extra=extra,
+ )
+ return re
+
+ def try_update(self, re):
+
+        # first, lookup existing by JSTOR id (which must be defined)
+ existing = None
+ try:
+ existing = self.api.lookup_release(jstor=re.ext_ids.jstor)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try DOI lookup if there is one
+ if not existing and re.ext_ids.doi:
+ try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if existing and existing.ext_ids.jstor:
+ # don't update if it already has JSTOR ID
+ self.counts['exists'] += 1
+ return False
+        elif existing:
+            # but do update if only the DOI was set
+            existing.ext_ids.jstor = re.ext_ids.jstor
+            if re.extra and 'jstor' in re.extra:
+                existing.extra = existing.extra or dict()
+                existing.extra['jstor'] = re.extra['jstor']
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
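+        # no existing release matched; the caller should insert a new one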
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
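+        # note: this parses the whole XML file into memory at once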
+
+        # 2. iterate over articles, call parse_record on each
+ for article in soup.find_all("article"):
+ resp = self.parse_record(article)
+            print(resp)
+ #sys.exit(-1)
+
+if __name__ == '__main__':
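+    # ad-hoc debug entrypoint; note that JstorImporter.__init__ as written
+    # above requires `api` and `issn_map_file` arguments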
+ parser = JstorImporter()
+ parser.parse_file(open(sys.argv[1]))