summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:25:03 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit 9688cedac61729bc417a3cb31096f52bdb6f16db (patch)
tree 4a504fec129547c107e47316bf5e25560f810ca8
parent 0ec3fc58b4394102ffaaf385e6048a6412a9c9b7 (diff)
download fatcat-9688cedac61729bc417a3cb31096f52bdb6f16db.tar.gz
fatcat-9688cedac61729bc417a3cb31096f52bdb6f16db.zip
JSTOR importer polish
-rw-r--r-- python/fatcat_tools/importers/jstor.py 52
-rw-r--r-- python/tests/import_jstor.py 10
2 files changed, 43 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index fd1decf7..4531d926 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,7 @@ from bs4 import BeautifulSoup
import fatcat_client
from .common import EntityImporter, clean, LANG_MAP_MARC
-# XXX: more entries
+# TODO: more entries?
JSTOR_CONTRIB_MAP = {
'author': 'author',
'editor': 'editor',
@@ -49,6 +49,11 @@ class JstorImporter(EntityImporter):
extra = dict()
extra_jstor = dict()
+ # JSTOR journal-id
+ journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
+ if journal_ids:
+ extra_jstor['journal_ids'] = journal_ids
+
journal_title = journal_meta.find("journal-title").string
publisher = journal_meta.find("publisher-name").string
issn = journal_meta.find("issn")
@@ -58,13 +63,24 @@ class JstorImporter(EntityImporter):
issn = "{}-{}".format(issn[0:4], issn[4:8])
else:
assert len(issn) == 9
- # XXX:
+
+ issnl = self.issn2issnl(issn)
container_id = None
- container = dict(
- name=journal_title,
- publisher=publisher,
- issn=issn, # TODO: ISSN-L lookup...
- )
+ if issnl:
+ container_id = self.lookup_issnl(issnl)
+
+ # create container if it doesn't exist
+ if (container_id is None and self.create_containers and (issnl is not None)
+ and journal_title):
+ ce = fatcat_client.ContainerEntity(
+ issnl=issnl,
+ publisher=publisher,
+ container_type=self.map_container_type(release_type),
+ name=clean(journal_title, force_xml=True),
+ extra=journal_extra)
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+ self._issnl_id_map[issnl] = container_id
doi = article_meta.find("article-id", {"pub-id-type": "doi"})
if doi:
@@ -92,6 +108,9 @@ class JstorImporter(EntityImporter):
name = surname.string
else:
name = None
+ role = JSTOR_CONTRIB_MAP.get(c['contrib-type'])
+ if not role and c['contrib-type']:
+ sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}".format(c['contrib-type']))
contribs.append(fatcat_client.ReleaseContrib(
role=JSTOR_CONTRIB_MAP.get(c['contrib-type']),
raw_name=clean(name),
@@ -109,6 +128,10 @@ class JstorImporter(EntityImporter):
release_year,
int(pub_date.month.string),
int(pub_date.day.string))
+ if release_date.day == 1 and release_date.month == 1:
+ # suspect jan 1st dates get set by JSTOR when actual
+ # date not known (citation needed), so drop them
+ release_date = None
volume = None
if article_meta.volume:
@@ -153,9 +176,6 @@ class JstorImporter(EntityImporter):
if issue_id:
extra_jstor['issue_id'] = issue_id
- # JSTOR journal-id
- # XXX:
-
# everything in JSTOR is published
release_stage = "published"
@@ -178,7 +198,7 @@ class JstorImporter(EntityImporter):
#original_title
release_type=release_type,
release_stage=release_stage,
- release_date=release_date.isoformat(),
+ release_date=release_date,
release_year=release_year,
ext_ids=fatcat_client.ReleaseExtIds(
doi=doi,
@@ -218,10 +238,14 @@ class JstorImporter(EntityImporter):
if err.status != 404:
raise err
- # then try DOI lookup if there is one
- if not existing and re.ext_ids.doi:
+ # then try DOI lookup if there is one (try JSTOR prefix+jstor_id if
+ # there isn't a DOI set)
+ if not existing:
+ doi = re.ext_ids.doi
+ if not doi:
+ doi = "10.2307/{}".format(re.ext_ids.jstor)
try:
- existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ existing = self.api.lookup_release(doi=doi)
except fatcat_client.rest.ApiException as err:
if err.status != 404:
raise err
diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py
index b1b86a44..019f0aae 100644
--- a/python/tests/import_jstor.py
+++ b/python/tests/import_jstor.py
@@ -56,22 +56,22 @@ def test_jstor_xml_parse(jstor_importer):
assert r.release_type == "abstract"
assert r.release_stage == "published"
assert r.license_slug == None
- # XXX: assert r.ext_ids.doi == "10.2307/111039"
+ assert r.ext_ids.doi == None
assert r.ext_ids.jstor == "111039"
assert r.language == "en"
assert r.volume == "5"
assert r.issue == None
assert r.pages == "831-832"
- # TODO: None if published jan 1st?
- assert str(r.release_date) == "1843-01-01"
+ # None because jan 1st
+ assert r.release_date == None
assert r.release_year == 1843
# matched by ISSN, so shouldn't be in there?
#assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
assert len(r.contribs) == 1
- # XXX: extra['jstor'] stuff
+ assert r.extra['jstor']['journal_ids'] == ['abstpapecommroya', 'j100687']
assert r.contribs[0].raw_name == "John Kinnersley Smythies"
assert r.contribs[0].given_name == "John Kinnersley"
assert r.contribs[0].surname == "Smythies"
- assert not r.refs
+ assert r.refs == None