author    Martin Czygan <martin.czygan@gmail.com>  2019-12-21 23:30:56 +0100
committer Martin Czygan <martin.czygan@gmail.com>  2019-12-28 23:07:31 +0100
commit    a196435a0e88f85785742cdd089344f97401b43a (patch)
tree      056dceaa4ccd567096b2d3e789efdd573682a8c3 /python/fatcat_tools
parent    52eabd48658a676ac4577d1c8da31df1fe58093e (diff)
download  fatcat-a196435a0e88f85785742cdd089344f97401b43a.tar.gz
          fatcat-a196435a0e88f85785742cdd089344f97401b43a.zip
address first round of MR14 comments
* add missing langdetect
* use entity_to_dict for json debug output
* factor out code for fields in function and add table driven tests (see sketch below)
* update citeproc types
* add author as default role
* add raw_affiliation
* include relations from datacite
* remove url (covered by doi already)

Using yapf for python formatting.
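Since the test changes themselves fall outside this path (the diffstat below is limited to python/fatcat_tools), here is only a minimal sketch of what table-driven tests for the factored-out helpers parse_datacite_titles and parse_datacite_dates (added near the end of the diff) could look like, assuming pytest and that both functions are importable from fatcat_tools.importers.datacite; the cases are illustrative, not taken from the real test suite:

import datetime

import pytest

from fatcat_tools.importers.datacite import (parse_datacite_dates,
                                             parse_datacite_titles)

TITLE_CASES = [
    # (attributes.titles input, expected (title, original_language_title, subtitle))
    ([], (None, None, None)),
    ([{'title': 'Meeting Heterogeneity in Consumer Demand'}],
     ('Meeting Heterogeneity in Consumer Demand', None, None)),
    ([{'title': 'A Title'}, {'title': 'A Subtitle', 'titleType': 'Subtitle'}],
     ('A Title', None, 'A Subtitle')),
]

DATE_CASES = [
    # (attributes.dates input, expected (release_date, release_year))
    ([], (None, None)),
    ([{'date': '2019-12-01', 'dateType': 'Available'}],
     (datetime.date(2019, 12, 1), 2019)),
    ([{'date': '2019', 'dateType': 'Available'}], (None, 2019)),
]


@pytest.mark.parametrize('titles,expected', TITLE_CASES)
def test_parse_datacite_titles(titles, expected):
    assert parse_datacite_titles(titles) == expected


@pytest.mark.parametrize('dates,expected', DATE_CASES)
def test_parse_datacite_dates(dates, expected):
    assert parse_datacite_dates(dates) == expected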
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/datacite.py  467
1 file changed, 319 insertions(+), 148 deletions(-)
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 77ce1012..19b89edf 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -14,6 +14,7 @@ import langcodes
import langdetect
import sqlite3
import sys
+from fatcat_tools.transforms import entity_to_dict
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
@@ -55,16 +56,42 @@ DATACITE_TYPE_MAP = {
'Thesis': 'thesis',
},
'citeproc': {
- 'dataset': 'dataset',
- 'chapter': 'chapter',
- 'article-journal': 'article-journal',
- 'song': 'song',
'article': 'article',
- 'report': 'report',
+ 'article-journal': 'article-journal',
+ 'article-magazine': 'article-magazine',
+ 'article-newspaper': 'article-newspaper',
+ 'bill': 'bill',
+ 'book': 'book',
+ 'broadcast': 'broadcast',
+ 'chapter': 'chapter',
+ 'dataset': 'dataset',
+ 'entry-dictionary': 'entry-dictionary',
+ 'entry-encyclopedia': 'entry-encyclopedia',
+ 'entry': 'entry',
+ 'figure': 'figure',
'graphic': 'graphic',
+ 'interview': 'interview',
+ 'legal_case': 'legal_case',
+ 'legislation': 'legislation',
+ 'manuscript': 'manuscript',
+ 'map': 'map',
+ 'motion_picture': 'motion_picture',
+ 'musical_score': 'musical_score',
+ 'pamphlet': 'pamphlet',
+ 'paper-conference': 'paper-conference',
+ 'patent': 'patent',
+ 'personal_communication': 'personal_communication',
+ 'post': 'post',
+ 'post-weblog': 'post-weblog',
+ 'report': 'report',
+ 'review-book': 'review-book',
+ 'review': 'review',
+ 'song': 'song',
+ 'speech': 'speech',
'thesis': 'thesis',
- 'book': 'book',
- },
+ 'treaty': 'treaty',
+ 'webpage': 'webpage',
+ }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
'bibtex': {
'phdthesis': 'thesis',
'inbook': 'chapter',
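The expanded citeproc sub-map above is consulted by the type-detection loop that appears further down in this diff. As a small illustration of that lookup (the sample types dict is invented, and DATACITE_TYPE_MAP is assumed importable from fatcat_tools.importers.datacite):

from fatcat_tools.importers.datacite import DATACITE_TYPE_MAP

# Sample value of attributes['types'] from a DataCite record (illustrative only).
types = {'citeproc': 'article-journal', 'schemaOrg': 'ScholarlyArticle'}

release_type = None
for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
    value = types.get(typeType)
    release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
    if release_type is not None:
        break

# release_type is now 'article-journal', taken from the citeproc sub-map.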
@@ -88,7 +115,6 @@ DATACITE_TYPE_MAP = {
}
}
-
# TODO(martin): merge this with other maps, maybe.
LICENSE_SLUG_MAP = {
"//creativecommons.org/licenses/by/2.0/": "CC-BY",
@@ -124,7 +150,8 @@ LICENSE_SLUG_MAP = {
"//www.karger.com/Services/SiteLicenses": "KARGER",
"//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
"//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
- "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2
+ "//www.opensource.org/licenses/EUPL-1.1":
+ "EUPL-1.1", # redirects to EUPL-1.2
"//www.opensource.org/licenses/MIT": "MIT",
# "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
# "http://rsc.li/journals-terms-of-use": "RSC",
@@ -146,23 +173,31 @@ LICENSE_SLUG_MAP = {
# Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
}
+
class DataciteImporter(EntityImporter):
"""
Importer for datacite records.
"""
-
- def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
- insert_log_file=None, **kwargs):
-
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of Datacite DOI metadata, harvested from REST API")
+ def __init__(self,
+ api,
+ issn_map_file,
+ debug=False,
+ lang_detect=False,
+ insert_log_file=None,
+ **kwargs):
+
+ eg_desc = kwargs.get(
+ 'editgroup_description',
+ "Automated import of Datacite DOI metadata, harvested from REST API"
+ )
eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
+ eg_extra['agent'] = eg_extra.get('agent',
+ 'fatcat_tools.DataciteImporter')
super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
self.create_containers = kwargs.get('create_containers', True)
extid_map_file = kwargs.get('extid_map_file')
@@ -179,18 +214,31 @@ class DataciteImporter(EntityImporter):
self.lang_detect = lang_detect
self.insert_log_file = insert_log_file
- print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+ print('datacite with debug={}, lang_detect={}'.format(
+ self.debug, self.lang_detect),
+ file=sys.stderr)
def lookup_ext_ids(self, doi):
"""
Return a dictionary of identifiers referring to the same things as the given DOI.
"""
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+ return dict(core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None)
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
[doi.lower()]).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ return dict(core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None)
row = [str(cell or '') or None for cell in row]
return dict(
core_id=row[0],
@@ -206,6 +254,8 @@ class DataciteImporter(EntityImporter):
"""
Mapping datacite JSON to ReleaseEntity.
"""
+ if not obj or not isinstance(obj, dict):
+ return None
if 'attributes' not in obj:
return None
@@ -218,43 +268,54 @@ class DataciteImporter(EntityImporter):
contribs = []
for i, c in enumerate(attributes['creators']):
- if 'nameType' in c and not c.get('nameType') == 'Personal':
- continue
- creator_id = None
- for nid in c.get('nameIdentifiers', []):
- if not nid.get('nameIdentifierScheme').lower() == "orcid":
+ nameType = c.get('nameType', '') or ''
+ if nameType == 'Personal':
+ creator_id = None
+ for nid in c.get('nameIdentifiers', []):
+ if not nid.get('nameIdentifierScheme').lower() == "orcid":
+ continue
+ orcid = nid.get('nameIdentifier',
+ '').replace('https://orcid.org/', '')
+ if not orcid:
+ continue
+ creator_id = self.lookup_orcid(orcid)
+ # TODO(martin): If creator_id is None, should we create creators?
+
+ # If there are multiple affiliation strings, use the first one.
+ affiliations = c.get('affiliation', []) or []
+ raw_affiliation = None
+ if len(affiliations) == 0:
+ raw_affiliation = None
+ else:
+ raw_affiliation = affiliations[0]
+
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ creator_id=creator_id,
+ index=i,
+ raw_name=c.get('name'),
+ given_name=c.get('givenName'),
+ surname=c.get('familyName'),
+ role='author',
+ raw_affiliation=raw_affiliation,
+ ))
+ elif nameType == 'Organizational':
+ name = c.get('name', '') or ''
+ if name == 'NN':
continue
- orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
- if not orcid:
+ if len(name) < 3:
continue
- creator_id = self.lookup_orcid(orcid)
- # TODO(martin): If creator_id is None, should we create creators?
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=i,
- raw_name=c.get('name'),
- given_name=c.get('givenName'),
- surname=c.get('familyName'),
- ))
+ extra = {'organization': name}
+ contribs.append(fatcat_openapi_client.ReleaseContrib(
+ index=i, extra=extra))
+ else:
+ print('unknown name type: {}'.format(nameType), file=sys.stderr)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
- title, subtitle = None, None
-
titles = attributes.get('titles', []) or []
- if len(titles) == 0:
- print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
- return False
- elif len(titles) == 1:
- # We do not care about the type then.
- title = titles[0].get('title', '') or ''
- title = title.strip()
- else:
- for entry in titles:
- if not title and ('titleType' not in entry or not entry.get('titleType')):
- title = entry.get('title').strip()
- if entry.get('titleType') == 'Subtitle':
- subtitle = entry.get('title', '').strip()
+ title, original_language_title, subtitle = parse_datacite_titles(
+ titles)
if not title:
print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
@@ -268,67 +329,14 @@ class DataciteImporter(EntityImporter):
# "attributes.dates[].dateType", values: "Accepted", "Available"
# "Collected", "Copyrighted", "Created", "Issued", "Submitted",
# "Updated", "Valid".
- release_year, release_date = None, None
-
- # Ignore: Collected, Issued.
- date_type_prio = (
- 'Valid',
- 'Available',
- 'Accepted',
- 'Submitted',
- 'Copyrighted',
- 'Created',
- 'Updated',
- )
-
- # Before using (expensive) dateparser, try a few common patterns.
- common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y')
-
- for prio in date_type_prio:
- dates = attributes.get('dates', []) or [] # Never be None.
- for item in dates:
- if not item.get('dateType') == prio:
- continue
-
- # Parse out date, use common patterns first, fallback to dateparser.
- result, value, year_only = None, item.get('date', ''), False
-
- for pattern in common_patterns:
- try:
- result = datetime.datetime.strptime(value, pattern)
- except ValueError:
- continue
- else:
- if pattern == '%Y':
- year_only = True
- break
-
- if result is None:
- print('fallback for {}'.format(value), file=sys.stderr)
- try:
- result = dateparser.parse(value)
- except TypeError as err:
- print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
- continue
-
- if result is None:
- # Unparsable date.
- continue
- if not year_only:
- release_date = result.date()
- release_year = result.year
- if 1000 < release_year < datetime.date.today().year + 5:
- # Skip possibly bogus dates.
- continue
- break
- else:
- continue
- break
+ release_date, release_year = parse_datacite_dates(
+ attributes.get('dates', []))
# Publisher. A few NA values. A few bogus values.
publisher = attributes.get('publisher')
- if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
+ if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
+ '(:none)'):
publisher = None
if publisher is not None and len(publisher) > 80:
# Arbitrary magic value max length. TODO(martin): better heuristic,
@@ -345,7 +353,8 @@ class DataciteImporter(EntityImporter):
container = attributes.get('container', {}) or {}
if container.get('type') in CONTAINER_TYPE_MAP.keys():
container_type = CONTAINER_TYPE_MAP.get(container['type'])
- if container.get('identifier') and container.get('identifierType') == 'ISSN':
+ if container.get('identifier') and container.get(
+ 'identifierType') == 'ISSN':
issn = container.get('identifier')
if len(issn) == 8:
issn = issn[:4] + "-" + issn[4:]
@@ -357,7 +366,8 @@ class DataciteImporter(EntityImporter):
container_title = container.get('title')
if isinstance(container_title, list):
if len(container_title) > 0:
- print('too many container titles: {}'.format(len(container_title)))
+ print('too many container titles: {}'.format(
+ len(container_title)))
container_title = container_title[0]
assert isinstance(container_title, str)
ce = fatcat_openapi_client.ContainerEntity(
@@ -404,7 +414,8 @@ class DataciteImporter(EntityImporter):
# types supplied in datacite. The "attributes.types.resourceType"
# contains too many (176 in sample) things for now; citeproc may be the
# closest, but not always supplied.
- for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+ for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
+ 'bibtex', 'ris'):
value = attributes.get('types', {}).get(typeType)
release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None:
@@ -442,19 +453,19 @@ class DataciteImporter(EntityImporter):
if len(desc.get('description', '')) < 10:
continue
text = desc.get('description')
- sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
lang = None
if self.lang_detect:
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException as err:
- print('language detection failed: {}'.format(err), file=sys.stderr)
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain",
- content=text,
- sha1=sha1,
- lang=lang,
- ))
+ print('language detection failed: {}'.format(err),
+ file=sys.stderr)
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ mimetype="text/plain",
+ content=text,
+ lang=lang,
+ ))
# References and relations. Datacite includes many relation types in
# "attributes.relatedIdentifiers[].relationType", e.g.
@@ -476,17 +487,19 @@ class DataciteImporter(EntityImporter):
ref_extra['doi'] = rel.get('relatedIdentifier')
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- index=ref_index,
- extra=ref_extra,
- ))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ index=ref_index,
+ extra=ref_extra,
+ ))
ref_index += 1
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
release_stage = None
- if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+ if attributes.get(
+ 'state') == 'findable' or attributes.get('isActive') is True:
release_stage = 'published'
# Extra information.
@@ -496,8 +509,22 @@ class DataciteImporter(EntityImporter):
extra_datacite['license'] = license_extra
if attributes.get('subjects'):
extra_datacite['subjects'] = attributes['subjects']
- if attributes.get('url'):
- extra_datacite['url'] = attributes['url']
+
+ # Include certain relations from relatedIdentifiers. Keeping the
+ # original structure of data here, which is a list of dicts, with
+ # relation type, identifier and identifier type (mostly).
+ relations = []
+ for rel in relIds:
+ if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
+ 'IsVariantFormOf', 'IsSupplementTo',
+ 'HasVersion', 'IsMetadataFor',
+ 'IsNewVersionOf', 'IsIdenticalTo',
+ 'IsVersionOf', 'IsDerivedFrom',
+ 'IsSourceOf'):
+ relations.append(rel)
+
+ if relations:
+ extra_datacite['relations'] = relations
extra = dict()
@@ -515,7 +542,7 @@ class DataciteImporter(EntityImporter):
release_stage=release_stage,
title=title,
subtitle=subtitle,
- original_title=title,
+ original_title=original_language_title,
release_year=release_year,
release_date=release_date,
publisher=publisher,
@@ -546,7 +573,7 @@ class DataciteImporter(EntityImporter):
hide schema mismatch bugs.
"""
if self.debug is True:
- print(json.dumps(re.to_dict(), default=extended_json_encoder))
+ print(json.dumps(entity_to_dict(re, api_client=None)))
return False
# lookup existing DOI (don't need to try other ext idents for crossref)
@@ -572,24 +599,15 @@ class DataciteImporter(EntityImporter):
if self.insert_log_file:
with open(self.insert_log_file, 'a') as f:
for doc in batch:
- json.dump(doc.to_dict(), f, default=extended_json_encoder)
+ json.dump(entity_to_dict(doc, api_client=None), f)
f.write('\n')
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
-def extended_json_encoder(value):
- """
- Can be used with json.dumps(value, default=extended_json_encoder) to serialize
- value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
- """
- if isinstance(value, (datetime.datetime, datetime.date)):
- return value.isoformat()
- if isinstance(value, set):
- return list(value)
- raise TypeError('cannot encode type: {}'.format(type(value)))
def lookup_license_slug(raw):
"""
@@ -604,3 +622,156 @@ def lookup_license_slug(raw):
if not raw.endswith('/'):
raw = raw + '/'
return LICENSE_SLUG_MAP.get(raw)
+
+
+def find_original_language_title(item, min_length=4, max_questionmarks=3):
+ """
+ Perform a few checks before returning a potential original language title.
+ """
+ if 'original_language_title' not in item:
+ return None
+ title = item.get('title')
+ if not title:
+ return None
+ original_language_title = item.get('original_language_title')
+ if isinstance(original_language_title,
+ str) and title != original_language_title:
+ if len(original_language_title) < min_length:
+ return None
+ if original_language_title.count('?') > max_questionmarks:
+ return None
+ return original_language_title
+ if isinstance(original_language_title, dict):
+ content = original_language_title.get('__content__', '') or ''
+ if content and content != title and not content.count(
+ '?') > max_questionmarks:
+ return content
+ return None
+
+
+def parse_datacite_titles(titles):
+ """
+ Given a list of title items from datacite, return 3-tuple (title,
+ original_language_title, subtitle).
+
+ Example input:
+
+ [
+ {
+ "title": "Meeting Heterogeneity in Consumer Demand"
+ }
+ ]
+ """
+ title, original_language_title, subtitle = None, None, None
+
+ if titles is None:
+ return title, original_language_title, subtitle
+ if len(titles) == 0:
+ return title, original_language_title, subtitle
+ elif len(titles) == 1:
+ original_language_title = find_original_language_title(titles[0])
+ title = titles[0].get('title', '') or ''
+ title = title.strip()
+ if not title:
+ title = None
+ return title, original_language_title, subtitle
+ else:
+ for entry in titles:
+ if not title and ('titleType' not in entry
+ or not entry.get('titleType')):
+ title = entry.get('title').strip()
+ if not subtitle and entry.get('titleType') == 'Subtitle':
+ subtitle = entry.get('title', '').strip()
+ if not original_language_title:
+ original_language_title = find_original_language_title(entry)
+
+ return title, original_language_title, subtitle
+
+
+def parse_datacite_dates(dates):
+ """
+ Given a list of date fields (under .dates), return tuple, (release_date,
+ release_year).
+ """
+ release_date, release_year = None, None
+
+ if not dates:
+ return release_date, release_year
+
+ if not isinstance(dates, list):
+ raise ValueError('expected a list of date items')
+
+ # Ignored: Collected, Issued.
+ date_type_prio = (
+ 'Valid',
+ 'Available',
+ 'Accepted',
+ 'Submitted',
+ 'Copyrighted',
+ 'Created',
+ 'Updated',
+ )
+
+ # Before using (expensive) dateparser, try a few common patterns.
+ common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
+ '%Y-%m-%dT%H:%M:%S', '%Y')
+
+ def parse_item(item):
+ result, value, year_only = None, item.get('date', ''), False
+ release_date, release_year = None, None
+
+ for pattern in common_patterns:
+ try:
+ result = datetime.datetime.strptime(value, pattern)
+ except ValueError:
+ continue
+ else:
+ if pattern == '%Y':
+ year_only = True
+ break
+
+ if result is None:
+ print('fallback for {}'.format(value), file=sys.stderr)
+ try:
+ result = dateparser.parse(value)
+ except TypeError as err:
+ print("{} date parsing failed with: {}".format(value, err),
+ file=sys.stderr)
+ return release_date, release_year
+
+ if result is None:
+ # Unparsable date.
+ return release_date, release_year
+
+ if not year_only:
+ release_date = result.date()
+ release_year = result.year
+
+ return release_date, release_year
+
+ for prio in date_type_prio:
+ for item in dates:
+ if not item.get('dateType') == prio:
+ continue
+
+ release_date, release_year = parse_item(item)
+ if release_date is None and release_year is None:
+ continue
+
+ if release_year < 1000 or release_year > datetime.date.today(
+ ).year + 5:
+ # Skip possibly bogus dates.
+ release_year = None
+ continue
+ break
+ else:
+ continue
+ break
+
+ if release_date is None and release_year is None:
+ for item in dates:
+ release_date, release_year = parse_item(item)
+ if release_year or release_date:
+ break
+
+ return release_date, release_year
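To close, a minimal usage sketch of the helpers defined above, run against a hand-written attributes fragment (illustrative only, not a real DataCite record); it also shows the original_language_title handling that feeds the new original_title value:

from fatcat_tools.importers.datacite import (parse_datacite_dates,
                                             parse_datacite_titles)

# Hand-written fragment in the shape of a DataCite record's attributes
# (illustrative only).
attributes = {
    'titles': [{
        'title': 'The Origin of Species',
        'original_language_title': {
            '__content__': 'Über die Entstehung der Arten',
        },
    }],
    'dates': [{'date': '2019-12-01', 'dateType': 'Available'}],
}

title, original_language_title, subtitle = parse_datacite_titles(
    attributes.get('titles', []) or [])
release_date, release_year = parse_datacite_dates(
    attributes.get('dates', []) or [])

print(title)                       # The Origin of Species
print(original_language_title)     # Über die Entstehung der Arten
print(release_date, release_year)  # 2019-12-01 2019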