summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/jstor.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-22 12:21:07 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-22 12:21:07 -0700
commit77abede313c97eefc5cef16dabc3213df7000b16 (patch)
treee7218a38f72285af1953fdd23e39c467f5464b2c /python/fatcat_tools/importers/jstor.py
parentd33c8cf05e3c9732b04f56cf356180b9d76e04e0 (diff)
downloadfatcat-77abede313c97eefc5cef16dabc3213df7000b16.tar.gz
fatcat-77abede313c97eefc5cef16dabc3213df7000b16.zip
JSTOR importer polish
Diffstat (limited to 'python/fatcat_tools/importers/jstor.py')
-rw-r--r--python/fatcat_tools/importers/jstor.py69
1 files changed, 51 insertions, 18 deletions
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index c8a7b20a..c846cbde 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
import fatcat_client
from .common import EntityImporter, clean, LANG_MAP_MARC
+from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
JSTOR_CONTRIB_MAP = {
@@ -16,6 +17,14 @@ JSTOR_CONTRIB_MAP = {
'illustrator': 'illustrator',
}
+JSTOR_TYPE_MAP = {
+ "book-review": "review-book",
+ "editorial": "editorial",
+ "misc": "stub",
+ "news": "article",
+ "research-article": "article-journal",
+}
+
class JstorImporter(EntityImporter):
"""
Importer for JSTOR bulk XML metadata (eg, from their Early Journals
@@ -38,6 +47,9 @@ class JstorImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
+ def map_container_type(self, crossref_type):
+ return CONTAINER_TYPE_MAP.get(crossref_type)
+
def want(self, obj):
return True
@@ -49,13 +61,22 @@ class JstorImporter(EntityImporter):
extra = dict()
extra_jstor = dict()
+ release_type = JSTOR_TYPE_MAP.get(article['article-type'])
title = article_meta.find("article-title")
- if title:
+ if title and title.string:
title = title.string.strip()
- if title.endswith('.'):
- title = title[:-1]
+ elif title and not title.string:
+ title = None
+
+ if not title and release_type.startswith('review') and article_meta.product.source:
+ title = "Review: {}".format(article_meta.product.source.string)
+
+ if not title:
+ return None
+
+ if title.endswith('.'):
+ title = title[:-1]
- release_type = "article-journal"
if "[Abstract]" in title:
# TODO: strip the "[Abstract]" bit?
release_type = "abstract"
@@ -108,28 +129,40 @@ class JstorImporter(EntityImporter):
jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
if jstor_id:
- jstor_id = jstor_id.string
+ jstor_id = jstor_id.string.strip()
+ if not jstor_id and doi:
+ assert doi.startswith('10.2307/')
+ jstor_id = doi.replace('10.2307/', '')
+ assert jstor_id and int(jstor_id)
contribs = []
cgroup = article_meta.find("contrib-group")
if cgroup:
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
+ if given:
+ given = clean(given.string)
surname = c.find("surname")
- if given and surname:
- name = "{} {}".format(given.string, surname.string)
- elif surname:
- name = surname.string
- else:
- name = None
- role = JSTOR_CONTRIB_MAP.get(c['contrib-type'])
- if not role and c['contrib-type']:
+ if surname:
+ surname = clean(surname.string)
+ raw_name = c.find("string-name")
+ if raw_name:
+ raw_name = clean(raw_name.string)
+
+ if not raw_name:
+ if given and surname:
+ raw_name = "{} {}".format(given, surname)
+ elif surname:
+ raw_name = surname
+
+ role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
+ if not role and c.get('contrib-type'):
sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type']))
contribs.append(fatcat_client.ReleaseContrib(
- role=JSTOR_CONTRIB_MAP.get(c['contrib-type']),
- raw_name=clean(name),
- given_name=clean(given.string),
- surname=clean(surname.string),
+ role=role,
+ raw_name=raw_name,
+ given_name=given,
+ surname=surname,
))
release_year = None
@@ -164,7 +197,7 @@ class JstorImporter(EntityImporter):
language = None
cm = article_meta.find("custom-meta")
if cm.find("meta-name").string == "lang":
- language = cm.find("meta-value").string
+ language = cm.find("meta-value").string.split()[0]
language = LANG_MAP_MARC.get(language)
if not language:
warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))