author     Bryan Newbold <bnewbold@robocracy.org>  2020-11-17 19:40:54 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2020-11-19 14:55:15 -0800
commit     92db2c8bb2464db8455b61b245a007cb57f2c92f (patch)
tree       53ea173f7588810c0461ae72e9703386262aea12 /python
parent     c9bfb0be4c7e38b6668f49588f2ffecee7b17912 (diff)
download   fatcat-92db2c8bb2464db8455b61b245a007cb57f2c92f.tar.gz
           fatcat-92db2c8bb2464db8455b61b245a007cb57f2c92f.zip
implement remainder of DOAJ article importer
Diffstat (limited to 'python')
-rwxr-xr-x  python/fatcat_import.py                         37
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py  182
-rw-r--r--  python/tests/import_doaj.py                     17

3 files changed, 168 insertions(+), 68 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 19cf43ec..ff6c94dc 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -256,6 +256,24 @@ def run_datacite(args):
     else:
         JsonLinePusher(dci, args.json_file).run()
 
+def run_doaj_article(args):
+    dai = DoajArticleImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        do_updates=args.do_updates,
+    )
+    if args.kafka_mode:
+        KafkaJsonPusher(
+            dai,
+            args.kafka_hosts,
+            args.kafka_env,
+            "api-doaj",
+            "fatcat-{}-import-doaj".format(args.kafka_env),
+            consume_batch_size=args.batch_size,
+        ).run()
+    else:
+        JsonLinePusher(dai, args.json_file).run()
+
 def run_file_meta(args):
     # do_updates defaults to true for this importer
     fmi = FileMetaImporter(args.api,
@@ -606,6 +624,25 @@ def main():
         auth_var="FATCAT_AUTH_WORKER_DATACITE",
     )
 
+    sub_doaj_article = subparsers.add_parser('doaj-article',
+        help="import doaj.org article metadata")
+    sub_doaj_article.add_argument('json_file',
+        help="File with JSON lines from DOAJ API (or bulk dump) to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_doaj_article.add_argument('--issn-map-file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_doaj_article.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_doaj_article.add_argument('--do-updates',
+        action='store_true',
+        help="update any pre-existing release entities")
+    sub_doaj_article.set_defaults(
+        func=run_doaj_article,
+        auth_var="FATCAT_AUTH_WORKER_DOAJ",
+    )
+
     sub_file_meta = subparsers.add_parser('file-meta',
         help="simple update-only importer for file metadata")
     sub_file_meta.set_defaults(
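
For reference, the new subcommand can be driven either from Kafka (--kafka-mode) or from a JSON-lines file. A minimal programmatic sketch of the same wiring, assuming both names are exported from fatcat_tools.importers (fatcat_import.py itself uses a wildcard import) and that `api` is an authenticated fatcat_openapi_client instance:

    # Sketch only: drive the DOAJ importer from Python, mirroring run_doaj_article().
    # The batch size of 50 is an assumed default, not taken from this diff.
    from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher

    def import_doaj_file(api, json_file, issn_map_file=None):
        dai = DoajArticleImporter(api, issn_map_file,
            edit_batch_size=50,
            do_updates=False)
        JsonLinePusher(dai, json_file).run()
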
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 74ac9a0e..c0e75283 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -4,17 +4,15 @@ Importer for DOAJ article-level metadata, schema v1.
 DOAJ API schema and docs: https://doaj.org/api/v1/docs
 """
 
-import collections
+import warnings
 import datetime
-import sys
-from typing import List, Dict, Optional
-
-import langdetect
+from typing import List, Optional
 
 import fatcat_openapi_client
-from fatcat_tools.normal import clean_doi
-from fatcat_tools.transforms import entity_to_dict
-from fatcat_tools.importers.common import EntityImporter, clean
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+    clean_orcid, detect_text_lang, parse_lang_name, parse_country_name,
+    clean_pmid, clean_pmcid)
+from fatcat_tools.importers.common import EntityImporter
 
 # Cutoff length for abstracts.
 MAX_ABSTRACT_LENGTH = 2048
@@ -48,7 +46,6 @@ class DoajArticleImporter(EntityImporter):
     def want(self, obj):
         return True
 
-
     def parse_record(self, obj):
         """
         bibjson {
@@ -74,14 +71,6 @@ class DoajArticleImporter(EntityImporter):
             title (string, optional),
             volume (string, optional)
         }
-
-        TODO:
-        - release_date
-        - container_id
-        - issue (number?)
-        - license is article license; import as slug
-        - "open_access" flag in doaj_meta
-        - container lookup from issns ("issns" key)
         """
 
         if not obj or not isinstance(obj, dict) or not 'bibjson' in obj:
@@ -90,42 +79,51 @@ class DoajArticleImporter(EntityImporter):
         bibjson = obj['bibjson']
 
-        title = clean(bibjson.get('title'))
+        title = clean_str(bibjson.get('title'), force_xml=True)
         if not title:
             self.counts['skip-title'] += 1
             return False
 
+        container_name = clean_str(bibjson['journal']['title'])
         container_id = None
-        container_name = None
-
-        volume = clean(bibjson['journal'].get('volume'))
-        number = clean(bibjson['journal'].get('number'))
-        publisher = clean(bibjson['journal'].get('publisher'))
+        # NOTE: 'issns' not documented in API schema
+        for issn in bibjson['journal']['issns']:
+            issnl = self.issn2issnl(issn)
+            if issnl:
+                container_id = self.lookup_issnl(issnl)
+            if container_id:
+                # don't store container_name when we have an exact match
+                container_name = None
+                break
+
+        volume = clean_str(bibjson['journal'].get('volume'))
+        # NOTE: this schema seems to use "number" as "issue number"
+        issue = clean_str(bibjson['journal'].get('number'))
+        publisher = clean_str(bibjson['journal'].get('publisher'))
 
         try:
             release_year = int(bibjson.get('year'))
         except (TypeError, ValueError):
             release_year = None
-        # XXX: parse_month
-        release_month = clean(bibjson.get('year'))
+        release_month = parse_month(clean_str(bibjson.get('month')))
 
         # block bogus far-future years/dates
         if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
             release_month = None
             release_year = None
 
-        # country
-        country = None
-        # XXX: country = parse_country(bibjson['journal'].get('country'))
-
-        # language
+        license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
+        country = parse_country_name(bibjson['journal'].get('country'))
         language = None
-        # XXX: language = parse_language(bibjson['journal'].get('language'))
+        for raw in bibjson['journal'].get('language') or []:
+            language = parse_lang_name(raw)
+            if language:
+                break
 
         # pages
-        # TODO: error in API docs? seems like start_page not under 'journal' object
-        start_page = clean(bibjson['journal'].get('start_page')) or clean(bibjson.get('start_page'))
-        end_page = clean(bibjson['journal'].get('end_page')) or clean(bibjson.get('end_page'))
+        # NOTE: error in API docs? seems like start_page not under 'journal' object
+        start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
+        end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
         pages: Optional[str] = None
         if start_page and end_page:
             pages = f"{start_page}-{end_page}"
@@ -136,13 +134,13 @@ class DoajArticleImporter(EntityImporter):
         ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
         abstracts = self.doaj_abstracts(bibjson)
         contribs = self.doaj_contribs(bibjson.get('author') or [])
-
+
         # DOAJ-specific extra
         doaj_extra = dict()
         if bibjson.get('subject'):
             doaj_extra['subject'] = bibjson.get('subject')
         if bibjson.get('keywords'):
-            doaj_extra['keywords'] = [k for k in [clean(s) for s in bibjson.get('keywords')] if k]
+            doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
 
         # generic extra
         extra = dict()
@@ -171,13 +169,12 @@ class DoajArticleImporter(EntityImporter):
             ext_ids=ext_ids,
             contribs=contribs,
             volume=volume,
-            number=number, # XXX
-            #issue,
+            issue=issue,
             pages=pages,
             language=language,
             abstracts=abstracts,
             extra=extra,
-            #license_slug=license_slug,
+            license_slug=license_slug,
         )
         re = self.biblio_hacks(re)
         return re
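
For orientation, a trimmed, hypothetical record of the shape parse_record() consumes; the field names follow the bibjson schema quoted in the docstring, and the values echo the fixture assertions in the test diff below:

    record = {
        "id": "e58f08a11ecb495ead55a44ad4f89808",
        "bibjson": {
            "title": "Example article title",
            "year": "2016",
            "month": "10",
            "journal": {
                "title": "Materials & Design",
                "volume": "108",
                "number": "21",          # mapped to ReleaseEntity.issue
                "publisher": "Elsevier",
                "issns": ["0264-1275"],
                "country": "United Kingdom",
                "language": ["English"],
                "license": [{"open_access": True, "type": "CC BY-NC-ND"}],
                "start_page": "608",
                "end_page": "617",
            },
            "identifier": [{"type": "doi", "id": "10.1016/j.matdes.2016.06.110"}],
            "author": [{"name": "Jane Doe", "affiliation": "Example University"}],
        },
    }

Fed through parse_record(), a record like this would come out with volume="108", issue="21", pages="608-617", license_slug="cc-by-nc-nd", and, when no container matches by ISSN-L, the journal title preserved as container_name.
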
@@ -192,7 +189,7 @@ class DoajArticleImporter(EntityImporter):
     def try_update(self, re):
 
-        # lookup existing DOI (don't need to try other ext idents for crossref)
+        # lookup existing release by DOAJ article id
         existing = None
         try:
             existing = self.api.lookup_release(doaj=re.ext_ids.doaj)
@@ -202,13 +199,62 @@ class DoajArticleImporter(EntityImporter):
             # doesn't exist, need to update
             return True
 
-        # eventually we'll want to support "updates", but for now just skip if
-        # entity already exists
-        if existing:
+        # then try other ext_id lookups
+        if not existing:
+            for extid_type in ('doi', 'pmid', 'pmcid'):
+                extid_val = getattr(re.ext_ids, extid_type)
+                if not extid_val:
+                    continue
+                try:
+                    existing = self.api.lookup_release(**{extid_type: extid_val})
+                except fatcat_openapi_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+                if existing:
+                    if existing.ext_ids.doaj:
+                        warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}"
+                        warnings.warn(warn_str)
+                        self.counts["skip-doaj-id-mismatch"] += 1
+                        return None
+                    break
+
+        # TODO: in the future could do fuzzy match here, eg using elasticsearch
+
+        # create entity
+        if not existing:
+            return True
+
+        # other logic could go here about skipping updates
+        if not self.do_updates or existing.ext_ids.doaj:
             self.counts['exists'] += 1
             return False
-        return True
 
+        # fields to copy over for update
+        existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj
+        existing.release_type = existing.release_type or re.release_type
+        existing.release_stage = existing.release_stage or re.release_stage
+        existing.container_id = existing.container_id or re.container_id
+        existing.abstracts = existing.abstracts or re.abstracts
+        existing.extra['doaj'] = re.extra['doaj']
+        existing.volume = existing.volume or re.volume
+        existing.issue = existing.issue or re.issue
+        existing.pages = existing.pages or re.pages
+        existing.language = existing.language or re.language
+
+        try:
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+        except fatcat_openapi_client.rest.ApiException as err:
+            # there is a code path where we try to update the same release
+            # twice in a row; if that happens, just skip
+            # NOTE: API behavior might change in the future?
+            if "release_edit_editgroup_id_ident_id_key" in err.body:
+                self.counts['skip-update-conflict'] += 1
+                return False
+            else:
+                raise err
+
+        return False
     def insert_batch(self, batch):
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
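
In the EntityImporter lifecycle this method plugs into, the return value steers what happens next: True means "no usable existing entity, create one", while False or None means "handled here, skip". A condensed, hypothetical driver loop illustrating that contract (the real pusher and importer classes handle batching and editgroups):

    # Hypothetical illustration of the importer contract assumed above.
    batch = []
    for record in records:
        release = importer.parse_record(record)        # ReleaseEntity, or False to skip
        if release and importer.try_update(release):   # True -> nothing existing matched
            batch.append(release)
        if len(batch) >= edit_batch_size:
            importer.insert_batch(batch)               # create_release_auto_batch(...)
            batch = []
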
@@ -218,19 +264,13 @@ class DoajArticleImporter(EntityImporter):
             entity_list=batch))
 
     def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
-        text = clean(bibjson['abstract'])
+        text = clean_str(bibjson.get('abstract'))
         if not text or len(text) < 10:
             return []
         if len(text) > MAX_ABSTRACT_LENGTH:
             text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
 
-        # Detect language. This is fuzzy and may be removed, if too unreliable.
-        lang = None
-        try:
-            lang = langdetect.detect(text)
-        except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
-            #print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
-            pass
+        lang = detect_text_lang(text)
 
         abstract = fatcat_openapi_client.ReleaseAbstract(
             mimetype="text/plain",
@@ -249,15 +289,22 @@ class DoajArticleImporter(EntityImporter):
         }
         """
         contribs = []
-        # TODO: index?
+        index = 0
         for author in authors:
             if not author.get('name'):
                 continue
+            creator_id = None
+            orcid = clean_orcid(author.get('orcid_id'))
+            if orcid:
+                creator_id = self.lookup_orcid(orcid)
             contribs.append(fatcat_openapi_client.ReleaseContrib(
                 raw_name=author.get('name'),
-                # XXX: orcid_id=author.get('orcid_id') or None,
-                # XXX: affiliation=author.get('affiliation') or None,
+                role='author',
+                index=index,
+                creator_id=creator_id,
+                raw_affiliation=clean_str(author.get('affiliation')),
             ))
+            index += 1
         return contribs
 
     def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
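
doaj_contribs() keeps author ordering through the explicit index counter and only consults the creator table when an ORCID normalizes cleanly. An illustrative (hypothetical) input and its mapping:

    authors = [
        {"name": "Jane Doe", "orcid_id": "0000-0002-1825-0097",
         "affiliation": "Example University"},
        {"affiliation": "Anonymous Institute"},  # no 'name': skipped entirely
    ]
    # -> [ReleaseContrib(raw_name="Jane Doe", role='author', index=0,
    #         creator_id=<ident if the ORCID is known>,
    #         raw_affiliation="Example University")]
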
@@ -277,9 +324,9 @@ class DoajArticleImporter(EntityImporter):
             if id_obj['type'].lower() == 'doi':
                 doi = clean_doi(id_obj['id'])
             elif id_obj['type'].lower() == 'pmid':
-                pmid = id_obj['id']
+                pmid = clean_pmid(id_obj['id'])
             elif id_obj['type'].lower() == 'pmcid':
-                pmcid = id_obj['id']
+                pmcid = clean_pmcid(id_obj['id'])
 
         return fatcat_openapi_client.ReleaseExtIds(
             doaj=doaj_article_id,
@@ -287,3 +334,24 @@ class DoajArticleImporter(EntityImporter):
             pmid=pmid,
             pmcid=pmcid,
         )
+
+    def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]:
+        """
+        bibjson.journal.license {
+            open_access (boolean, optional),
+            title (string, optional),
+            type (string, optional),
+            url (string, optional),
+            version (string, optional)
+        }
+        """
+        if not license_list:
+            return None
+        for license in license_list:
+            if not license.get('open_access'):
+                continue
+            slug = license.get('type')
+            if slug and slug.startswith('CC '):
+                slug = slug.replace('CC ', 'cc-').lower()
+            return slug
+        return None
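
The slug normalization above only rewrites 'CC '-prefixed license types; any other type string on the first open_access license is returned unchanged. Illustrative inputs (hypothetical values):

    # doaj_license_slug() behavior, per the code above:
    #   [{"open_access": True,  "type": "CC BY-NC-ND"}]   -> "cc-by-nc-nd"
    #   [{"open_access": True,  "type": "Publisher own"}] -> "Publisher own"
    #   [{"open_access": False, "type": "CC BY"}]         -> None (not open access)
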
diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py
index a75b574e..bceb1343 100644
--- a/python/tests/import_doaj.py
+++ b/python/tests/import_doaj.py
@@ -60,7 +60,7 @@ def test_doaj_dict_parse(doaj_importer):
     assert r.publisher == "Elsevier"
     assert r.release_type == "article-journal"
     assert r.release_stage == "published"
-    # XXX: assert r.license_slug == "cc-by-nc-nd"
+    assert r.license_slug == "cc-by-nc-nd"
     assert r.original_title == None
     assert r.ext_ids.doi == "10.1016/j.matdes.2016.06.110"
     assert r.ext_ids.doaj == "e58f08a11ecb495ead55a44ad4f89808"
@@ -71,9 +71,9 @@ def test_doaj_dict_parse(doaj_importer):
     assert r.number == None
     assert r.pages == "608-617"
     assert r.version == None
-    # XXX: assert r.language == "en"
+    assert r.language == "en"
     # matched by ISSN, so wouldn't be defined normally
-    # XXX: assert r.extra['container_name'] == "Materials & Design"
+    assert r.extra['container_name'] == "Materials & Design"
     assert len(r.abstracts) == 1
     assert len(r.abstracts[0].content) == 1033
     assert len(r.contribs) == 5
@@ -82,11 +82,6 @@ def test_doaj_dict_parse(doaj_importer):
     assert r.contribs[0].surname == None
     assert not r.refs
 
-    print(r.extra)
-    # XXX: assert r.extra['release_month'] == 10
-    # XXX: assert r.extra['country'] == 'gb'
-
-    #assert r.extra["doaj"]["subjects"] == [
-    #    {"subject": "Plant Genetic Resource for Food and Agriculture"}
-    #]
-
+    #print(r.extra)
+    assert r.extra['release_month'] == 10
+    assert r.extra['country'] == 'gb'
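
These assertions run the importer end-to-end against the bundled DOAJ sample fixture. Assuming plain pytest works from the python/ directory (the exact test tooling is not shown in this diff):

    pytest tests/import_doaj.py -k test_doaj_dict_parse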