author    Bryan Newbold <bnewbold@robocracy.org>  2019-05-15 15:06:35 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2019-05-21 11:41:29 -0700
commit    82ea0b02b29d6be0542eb43f00710a23ed8516c2 (patch)
tree      5be411a5c9a4880c36e2b7e764e1954e84076abb /python
parent    82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (diff)
initial JSTOR importer
Diffstat (limited to 'python')
-rw-r--r--  python/fatcat_tools/importers/__init__.py                                         1
-rw-r--r--  python/fatcat_tools/importers/jstor.py (renamed from python/parse_jstor_xml.py)  170
2 files changed, 132 insertions, 39 deletions
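
This commit moves the standalone parser into the fatcat_tools.importers framework, so it is driven like the other importers: a record pusher feeds XML <article> elements into the EntityImporter subclass. A minimal sketch of invoking it (file names and api-client setup here are assumptions, not part of this commit; assumes Bs4XmlFilePusher takes an importer, a file, and a record tag, matching its use by the other XML importers):

    # Sketch only: wiring up the new importer end-to-end.
    import fatcat_client
    from fatcat_tools.importers import JstorImporter, Bs4XmlFilePusher

    api = fatcat_client.DefaultApi(fatcat_client.ApiClient())
    # hypothetical ISSN-to-ISSN-L map file and sample XML file names
    importer = JstorImporter(api, open('ISSN-to-ISSN-L.txt'))
    Bs4XmlFilePusher(importer, open('jstor_sample.xml'), "article").run()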
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index ecbfe38e..497946ea 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -15,6 +15,7 @@ To run an import you combine two classes; one each of:
from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .jalc import JalcImporter
+from .jstor import JstorImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/parse_jstor_xml.py b/python/fatcat_tools/importers/jstor.py
index 04f2b18e..6ab320c3 100644
--- a/python/parse_jstor_xml.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -1,31 +1,56 @@
+
import sys
import json
+import sqlite3
import datetime
+import itertools
+import subprocess
from bs4 import BeautifulSoup
-from bs4.element import NavigableString
+import fatcat_client
+from .common import EntityImporter, clean
+
+# is this just ISO 3-char to ISO 2-char?
+# XXX: more entries
+JSTOR_LANG_MAP = {
+ 'eng': 'en',
+}
-class JstorXmlParser():
+# XXX: more entries
+JSTOR_CONTRIB_MAP = {
+ 'author': 'author',
+ 'editor': 'editor',
+ 'translator': 'translator',
+ 'illustrator': 'illustrator',
+}
+
+class JstorImporter(EntityImporter):
"""
- Converts JSTOR bulk XML metadata (eg, from their Early Journals Collection)
+ Importer for JSTOR bulk XML metadata (eg, from their Early Journals
+ Collection)
"""
- def __init__(self):
- pass
+ def __init__(self, api, issn_map_file, **kwargs):
- def parse_file(self, handle):
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of JSTOR XML metadata")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
- # 1. open with beautiful soup
- soup = BeautifulSoup(handle, "xml")
+ self.create_containers = kwargs.get('create_containers')
- # 2. iterate over articles, call parse_article on each
- for article in soup.find_all("article"):
- resp = self.parse_article(article)
- print(json.dumps(resp))
- #sys.exit(-1)
+ self.read_issn_map_file(issn_map_file)
- def parse_article(self, article):
+ def want(self, obj):
+ return True
+
+ def parse_record(self, article):
journal_meta = article.front.find("journal-meta")
article_meta = article.front.find("article-meta")
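
On the "is this just ISO 3-char to ISO 2-char?" question in the hunk above: the JSTOR values do look like ISO 639-2 three-letter codes, in which case the hand-maintained JSTOR_LANG_MAP could be replaced by a library lookup. A sketch using pycountry (not a dependency of this commit; an assumption):

    # Sketch: map an ISO 639-2/3 three-letter code to the two-letter
    # ISO 639-1 code, returning None when no two-letter code exists.
    import pycountry

    def jstor_lang_to_iso639_1(code):
        lang = pycountry.languages.get(alpha_3=code.lower())
        return getattr(lang, 'alpha_2', None) if lang else None

    assert jstor_lang_to_iso639_1('eng') == 'en'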
@@ -42,16 +67,22 @@ class JstorXmlParser():
issn = "{}-{}".format(issn[0:4], issn[4:8])
else:
assert len(issn) == 9
+ # XXX: container_id lookup (via ISSN-L map) not yet implemented
+ container_id = None
container = dict(
name=journal_title,
publisher=publisher,
issn=issn, # TODO: ISSN-L lookup...
)
- doi = article_meta.find("article-id", attr={"pub-id-type": "doi"})
+ doi = article_meta.find("article-id", {"pub-id-type": "doi"})
if doi:
doi = doi.string.lower().strip()
+ jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
+ if jstor_id:
+ jstor_id = jstor_id.string
+
title = article_meta.find("article-title")
if title:
title = title.string.strip()
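
A note on the article-id fix in the hunk above: BeautifulSoup's find() treats unknown keyword arguments as attribute filters, so the old attr={"pub-id-type": "doi"} call silently filtered on a literal "attr" attribute and matched nothing. The hyphenated pub-id-type attribute cannot be spelled as a keyword argument at all, which is why the dict must be passed positionally (equivalent to attrs=...). A standalone check (assumes the lxml-backed "xml" parser the importer itself uses):

    # Sketch: the old keyword form never matches; the positional form does.
    from bs4 import BeautifulSoup

    snippet = BeautifulSoup(
        '<doc><article-id pub-id-type="doi">10.2307/12345</article-id></doc>', "xml")
    assert snippet.find("article-id", {"pub-id-type": "doi"}) is not None
    assert snippet.find("article-id", attr={"pub-id-type": "doi"}) is None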
@@ -70,9 +101,11 @@ class JstorXmlParser():
name = surname.string
else:
name = None
- contribs.append(dict(
- role=c['contrib-type'], # XXX: types? mapping?
- raw_name=name,
+ contribs.append(fatcat_client.ReleaseContrib(
+ role=JSTOR_CONTRIB_MAP.get(c['contrib-type']),
+ raw_name=clean(name),
+ given_name=clean(given.string),
+ surname=clean(surname.string),
))
release_year = None
@@ -104,9 +137,11 @@ class JstorXmlParser():
cm = article_meta.find("custom-meta")
if cm.find("meta-name").string == "lang":
language = cm.find("meta-value").string
+ language = JSTOR_LANG_MAP.get(language)
release_type = "article-journal"
if "[Abstract]" in title:
+ # TODO: strip the "[Abstract]" bit?
release_type = "abstract"
elif "[Editorial" in title:
release_type = "editorial"
@@ -119,38 +154,53 @@ class JstorXmlParser():
# strip brackets if that is all that is there (eg, translation or non-english)
title = title[1:-1]
+ # JSTOR issue-id
+ if article_meta.find('issue-id'):
+ issue_id = clean(article_meta.find('issue-id').string)
+ if issue_id:
+ extra_jstor['issue_id'] = issue_id
+
+ # JSTOR journal-id
+ # XXX: journal-id not captured in extra_jstor yet
+
# everything in JSTOR is published
- release_status = "published"
+ release_stage = "published"
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # pubmed: retraction refs
if extra_jstor:
extra['jstor'] = extra_jstor
if not extra:
extra = None
- re = dict(
- issn=issn, # not an entity field
+ re = fatcat_client.ReleaseEntity(
#work_id
title=title,
#original_title
release_type=release_type,
- release_status=release_status,
+ release_stage=release_stage,
release_date=release_date.isoformat(),
release_year=release_year,
- doi=doi,
- #pmid
- #pmcid
- #isbn13 # TODO: ?
+ ext_ids=fatcat_client.ReleaseExtIds(
+ doi=doi,
+ jstor=jstor_id,
+ ),
volume=volume,
issue=issue,
pages=pages,
publisher=publisher,
language=language,
- #license_slug # TODO: ?
+ #license_slug
# content, mimetype, lang
#abstracts=abstracts,
- # raw_name, role, raw_affiliation, extra
contribs=contribs,
# key, year, container_name, title, locator
@@ -159,20 +209,62 @@ class JstorXmlParser():
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
- container=container,
-
- # extra:
- # withdrawn_date
- # translation_of
- # subtitle
- # aliases
- # container_name
- # group-title
- # pubmed: retraction refs
+ container_id=container_id,
+
extra=extra,
)
return re
+ def try_update(self, re):
+
+ # first, lookup existing by JSTOR id (which must be defined)
+ existing = None
+ try:
+ existing = self.api.lookup_release(jstor=re.ext_ids.jstor)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try DOI lookup if there is one
+ if not existing and re.ext_ids.doi:
+ try:
+ existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if existing and existing.ext_ids.jstor:
+ # don't update if it already has JSTOR ID
+ self.counts['exists'] += 1
+ return False
+ elif existing:
+ # but do update if only DOI was set
+ existing.ext_ids.jstor = re.ext_ids.jstor
+ existing.extra['jstor'] = re.extra['jstor']
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts['update'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+ # 2. iterate over articles, call parse_article on each
+ for article in soup.find_all("article"):
+ resp = self.parse_record(article)
+ print(resp)  # ReleaseEntity is not JSON-serializable
+ #sys.exit(-1)
+
if __name__=='__main__':
- parser = JstorXmlParser()
+ parser = JstorImporter()
parser.parse_file(open(sys.argv[1]))
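
One loose end worth noting: the "XXX: container_id" in parse_record leaves container_id as None, and the container dict built next to it is never used. The other importers resolve this by mapping the parsed ISSN to an ISSN-L via the map file loaded in __init__, then looking up the container ident. A sketch of how that gap could presumably be filled (the issn2issnl() and lookup_issnl() helper names are assumptions based on the shared EntityImporter code that read_issn_map_file() implies, not part of this commit):

    # Sketch: resolve container_id from the parsed ISSN inside parse_record().
    issnl = self.issn2issnl(issn)
    container_id = self.lookup_issnl(issnl) if issnl else None

The create_containers kwarg captured in __init__ suggests a follow-on step (creating a new container entity when the lookup misses), but that is not implemented in this commit.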