Diffstat (limited to 'python/fatcat_tools/importers/crossref.py')
-rw-r--r--  python/fatcat_tools/importers/crossref.py | 413
1 file changed, 246 insertions(+), 167 deletions(-)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fd6936a4..606d4bb1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,4 +1,3 @@
-
import datetime
import sqlite3
from typing import Any, Dict, Optional
@@ -13,30 +12,30 @@ from .common import EntityImporter, clean
# Can get a list of Crossref types (with counts) via API:
# https://api.crossref.org/works?rows=0&facet=type-name:*
CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
- 'book': 'book',
- 'book-chapter': 'chapter',
- 'book-part': 'chapter',
- 'book-section': 'chapter',
- 'component': 'component',
- 'dataset': 'dataset',
- 'dissertation': 'thesis',
- 'edited-book': 'book',
- 'journal-article': 'article-journal',
- 'monograph': 'book',
- 'other': None,
- 'peer-review': 'peer_review',
- 'posted-content': 'post',
- 'proceedings-article': 'paper-conference',
- 'reference-book': 'book',
- 'reference-entry': 'entry',
- 'report': 'report',
- 'standard': 'standard',
+ "book": "book",
+ "book-chapter": "chapter",
+ "book-part": "chapter",
+ "book-section": "chapter",
+ "component": "component",
+ "dataset": "dataset",
+ "dissertation": "thesis",
+ "edited-book": "book",
+ "journal-article": "article-journal",
+ "monograph": "book",
+ "other": None,
+ "peer-review": "peer_review",
+ "posted-content": "post",
+ "proceedings-article": "paper-conference",
+ "reference-book": "book",
+ "reference-entry": "entry",
+ "report": "report",
+ "standard": "standard",
}
CONTAINER_TYPE_MAP: Dict[str, str] = {
- 'article-journal': 'journal',
- 'paper-conference': 'conference',
- 'book': 'book-series',
+ "article-journal": "journal",
+ "paper-conference": "conference",
+ "book": "book-series",
}
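
For orientation, a standalone sketch of how these maps behave (re-declaring a slice of CROSSREF_TYPE_MAP so the snippet runs on its own): a type explicitly mapped to None ("other") and a type missing from the map entirely both come back as None from .get(), so downstream code treats them the same.

from typing import Dict, Optional

# Slice of the mapping above, re-declared so the snippet runs standalone.
CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
    "journal-article": "article-journal",
    "other": None,
}

def map_release_type(crossref_type: str) -> Optional[str]:
    # Both an explicitly-unmapped type ("other") and a type absent from
    # the map yield None; .get() alone can't tell the two cases apart.
    return CROSSREF_TYPE_MAP.get(crossref_type)

assert map_release_type("journal-article") == "article-journal"
assert map_release_type("other") is None          # explicitly unmapped
assert map_release_type("journal-issue") is None  # absent from the map
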
# These are based, informally, on sorting the most popular licenses found in
@@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = {
"//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
}
+
def lookup_license_slug(raw: str) -> Optional[str]:
if not raw:
return None
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if 'creativecommons.org' in raw.lower():
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if "creativecommons.org" in raw.lower():
raw = raw.lower()
- raw = raw.replace('/legalcode', '/').replace('/uk', '')
- if not raw.endswith('/'):
- raw = raw + '/'
+ raw = raw.replace("/legalcode", "/").replace("/uk", "")
+ if not raw.endswith("/"):
+ raw = raw + "/"
return LICENSE_SLUG_MAP.get(raw)
+
def test_lookup_license_slug():
assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0"
+ assert (
+ lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+ == "CC-BY"
+ )
+ assert (
+ lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+ == "CC-0"
+ )
assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA"
+ assert (
+ lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+ == "CC-BY-NC-SA"
+ )
assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
assert lookup_license_slug("") is None
assert lookup_license_slug(None) is None
+
class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc: Optional[str] = kwargs.get('editgroup_description',
- "Automated import of Crossref DOI metadata, harvested from REST API")
- eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
- super().__init__(api,
+ eg_desc: Optional[str] = kwargs.get(
+ "editgroup_description",
+ "Automated import of Crossref DOI metadata, harvested from REST API",
+ )
+ eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers: bool = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ self.create_containers: bool = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db: Optional[Any] = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter):
def lookup_ext_ids(self, doi: str) -> Optional[Any]:
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter):
return CONTAINER_TYPE_MAP.get(crossref_type)
def want(self, obj: Dict[str, Any]) -> bool:
- if not obj.get('title'):
- self.counts['skip-blank-title'] += 1
+ if not obj.get("title"):
+ self.counts["skip-blank-title"] += 1
return False
# these are pre-registered DOIs before the actual record is ready
# title is a list of titles
- titles = obj.get('title')
+ titles = obj.get("title")
if titles is not None and titles[0].strip().lower() in [
- "OUP accepted manuscript".lower(),
- ]:
- self.counts['skip-stub-title'] += 1
+ "OUP accepted manuscript".lower(),
+ ]:
+ self.counts["skip-stub-title"] += 1
return False
# do most of these checks in-line below
@@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter):
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
- if obj.get('type') in (None, 'journal', 'proceedings',
- 'standard-series', 'report-series', 'book-series', 'book-set',
- 'book-track', 'proceedings-series'):
- self.counts['skip-release-type'] += 1
+ if obj.get("type") in (
+ None,
+ "journal",
+ "proceedings",
+ "standard-series",
+ "report-series",
+ "book-series",
+ "book-set",
+ "book-track",
+ "proceedings-series",
+ ):
+ self.counts["skip-release-type"] += 1
return None
# Do require the 'title' keys to exist, as release entities do
- if ('title' not in obj) or (not obj['title']):
- self.counts['skip-blank-title'] += 1
+ if ("title" not in obj) or (not obj["title"]):
+ self.counts["skip-blank-title"] += 1
return None
- release_type = self.map_release_type(obj['type'])
+ release_type = self.map_release_type(obj["type"])
# contribs
def do_contribs(obj_list, ctype):
contribs = []
for i, am in enumerate(obj_list):
creator_id = None
- if 'ORCID' in am.keys():
- creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
+ if "ORCID" in am.keys():
+ creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
# Sorry humans :(
- if am.get('given') and am.get('family'):
- raw_name = "{} {}".format(am['given'], am['family'])
- elif am.get('family'):
- raw_name = am['family']
+ if am.get("given") and am.get("family"):
+ raw_name = "{} {}".format(am["given"], am["family"])
+ elif am.get("family"):
+ raw_name = am["family"]
else:
# TODO: can end up empty
- raw_name = am.get('name') or am.get('given')
+ raw_name = am.get("name") or am.get("given")
extra = dict()
if ctype == "author":
index = i
else:
index = None
raw_affiliation = None
- if am.get('affiliation'):
- if len(am.get('affiliation')) > 0:
- raw_affiliation = am.get('affiliation')[0]['name']
- if len(am.get('affiliation')) > 1:
+ if am.get("affiliation"):
+ if len(am.get("affiliation")) > 0:
+ raw_affiliation = am.get("affiliation")[0]["name"]
+ if len(am.get("affiliation")) > 1:
# note: affiliation => more_affiliations
- extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
- if am.get('sequence') and am.get('sequence') != "additional":
- extra['seq'] = clean(am.get('sequence'))
+ extra["more_affiliations"] = [
+ clean(a["name"]) for a in am.get("affiliation")[1:]
+ ]
+ if am.get("sequence") and am.get("sequence") != "additional":
+ extra["seq"] = clean(am.get("sequence"))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=index,
- raw_name=raw_name,
- given_name=clean(am.get('given')),
- surname=clean(am.get('family')),
- raw_affiliation=clean(raw_affiliation),
- role=ctype,
- extra=extra))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ creator_id=creator_id,
+ index=index,
+ raw_name=raw_name,
+ given_name=clean(am.get("given")),
+ surname=clean(am.get("family")),
+ raw_affiliation=clean(raw_affiliation),
+ role=ctype,
+ extra=extra,
+ )
+ )
return contribs
- contribs = do_contribs(obj.get('author', []), "author")
- contribs.extend(do_contribs(obj.get('editor', []), "editor"))
- contribs.extend(do_contribs(obj.get('translator', []), "translator"))
+
+ contribs = do_contribs(obj.get("author", []), "author")
+ contribs.extend(do_contribs(obj.get("editor", []), "editor"))
+ contribs.extend(do_contribs(obj.get("translator", []), "translator"))
# container
- issn = obj.get('ISSN', [None])[0]
+ issn = obj.get("ISSN", [None])[0]
issnl = self.issn2issnl(issn)
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = clean(obj.get('publisher'))
+ publisher = clean(obj.get("publisher"))
- container_name = obj.get('container-title')
+ container_name = obj.get("container-title")
if container_name:
container_name = clean(container_name[0], force_xml=True)
if not container_name:
container_name = None
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=container_name)
+ name=container_name,
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
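
The name-assembly branch in do_contribs() above ("Sorry humans :(") is easy to misread; a standalone sketch of just that logic, with illustrative contributor records:

from typing import Optional

def assemble_raw_name(am: dict) -> Optional[str]:
    # Mirrors the given/family fallback chain in do_contribs() above.
    if am.get("given") and am.get("family"):
        return "{} {}".format(am["given"], am["family"])
    elif am.get("family"):
        return am["family"]
    # As the original TODO notes, this branch can end up empty/None.
    return am.get("name") or am.get("given")

assert assemble_raw_name({"given": "Ada", "family": "Lovelace"}) == "Ada Lovelace"
assert assemble_raw_name({"family": "Lovelace"}) == "Lovelace"
assert assemble_raw_name({"name": "Some Consortium"}) == "Some Consortium"
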
@@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter):
# license slug
license_slug = None
license_extra = []
- for lic in obj.get('license', []):
- if lic['content-version'] not in ('vor', 'unspecified'):
+ for lic in obj.get("license", []):
+ if lic["content-version"] not in ("vor", "unspecified"):
continue
- slug = lookup_license_slug(lic['URL'])
+ slug = lookup_license_slug(lic["URL"])
if slug:
license_slug = slug
- if 'start' in lic:
- lic['start'] = lic['start']['date-time']
+ if "start" in lic:
+ lic["start"] = lic["start"]["date-time"]
license_extra.append(lic)
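
A standalone sketch of this selection loop; the record is illustrative, not a real Crossref response, and the slug lookup is stubbed in place of the lookup_license_slug() defined earlier in the file.

# Stand-in for lookup_license_slug(), just enough for the demo to run.
def lookup_license_slug(raw):
    return "CC-BY" if "creativecommons.org/licenses/by/" in raw else None

# Illustrative record, not a real Crossref response.
record = {
    "license": [
        {"content-version": "tdm", "URL": "https://example.com/tdm-terms"},
        {
            "content-version": "vor",
            "URL": "https://creativecommons.org/licenses/by/4.0/",
            "start": {"date-time": "2020-01-01T00:00:00Z"},
        },
    ]
}

license_slug = None
license_extra = []
for lic in record.get("license", []):
    # Only version-of-record (or unspecified) license terms are considered.
    if lic["content-version"] not in ("vor", "unspecified"):
        continue
    slug = lookup_license_slug(lic["URL"])
    if slug:
        license_slug = slug
    # Flatten the nested start date before stashing the entry in "extra".
    if "start" in lic:
        lic["start"] = lic["start"]["date-time"]
    license_extra.append(lic)

assert license_slug == "CC-BY"
assert license_extra[0]["start"] == "2020-01-01T00:00:00Z"
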
# references
refs = []
- for i, rm in enumerate(obj.get('reference', [])):
+ for i, rm in enumerate(obj.get("reference", [])):
try:
- year: Optional[int] = int(rm.get('year'))
+ year: Optional[int] = int(rm.get("year"))
# TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year is not None:
@@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter):
except (TypeError, ValueError):
year = None
ref_extra: Dict[str, Any] = dict()
- key = rm.get('key')
- if key and key.startswith(obj['DOI'].upper()):
- key = key.replace(obj['DOI'].upper() + "-", '')
- key = key.replace(obj['DOI'].upper(), '')
- ref_container_name = rm.get('volume-title')
+ key = rm.get("key")
+ if key and key.startswith(obj["DOI"].upper()):
+ key = key.replace(obj["DOI"].upper() + "-", "")
+ key = key.replace(obj["DOI"].upper(), "")
+ ref_container_name = rm.get("volume-title")
if not ref_container_name:
- ref_container_name = rm.get('journal-title')
- elif rm.get('journal-title'):
- ref_extra['journal-title'] = rm['journal-title']
- if rm.get('DOI'):
- ref_extra['doi'] = rm.get('DOI').lower()
- author = clean(rm.get('author'))
+ ref_container_name = rm.get("journal-title")
+ elif rm.get("journal-title"):
+ ref_extra["journal-title"] = rm["journal-title"]
+ if rm.get("DOI"):
+ ref_extra["doi"] = rm.get("DOI").lower()
+ author = clean(rm.get("author"))
if author:
- ref_extra['authors'] = [author]
- for k in ('editor', 'edition', 'authority', 'version', 'genre',
- 'url', 'event', 'issue', 'volume', 'date', 'accessed_date',
- 'issued', 'page', 'medium', 'collection_title', 'chapter_number',
- 'unstructured', 'series-title', 'volume-title'):
+ ref_extra["authors"] = [author]
+ for k in (
+ "editor",
+ "edition",
+ "authority",
+ "version",
+ "genre",
+ "url",
+ "event",
+ "issue",
+ "volume",
+ "date",
+ "accessed_date",
+ "issued",
+ "page",
+ "medium",
+ "collection_title",
+ "chapter_number",
+ "unstructured",
+ "series-title",
+ "volume-title",
+ ):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- index=i,
- # doing lookups would be a second import pass
- target_release_id=None,
- key=key,
- year=year,
- container_name=clean(ref_container_name),
- title=clean(rm.get('article-title')),
- locator=clean(rm.get('first-page')),
- # TODO: just dump JSON somewhere here?
- extra=ref_extra))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ index=i,
+ # doing lookups would be a second import pass
+ target_release_id=None,
+ key=key,
+ year=year,
+ container_name=clean(ref_container_name),
+ title=clean(rm.get("article-title")),
+ locator=clean(rm.get("first-page")),
+ # TODO: just dump JSON somewhere here?
+ extra=ref_extra,
+ )
+ )
# abstracts
abstracts = []
- abstract = clean(obj.get('abstract'))
+ abstract = clean(obj.get("abstract"))
if abstract and len(abstract) > 10:
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(
- mimetype="application/xml+jats",
- content=abstract))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ mimetype="application/xml+jats", content=abstract
+ )
+ )
# extra fields
extra = dict()
extra_crossref = dict()
# top-level extra keys
if not container_id:
- if obj.get('container-title'):
- extra['container_name'] = container_name
- for key in ('group-title'):
+ if obj.get("container-title"):
+ extra["container_name"] = container_name
+ for key in "group-title":
val = obj.get(key)
if val:
if type(val) == list:
@@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter):
else:
extra[key] = val
# crossref-nested extra keys
- for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
+ for key in ("subject", "type", "alternative-id", "archive", "funder"):
val = obj.get(key)
if val:
if type(val) == str:
@@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter):
else:
extra_crossref[key] = val
if license_extra:
- extra_crossref['license'] = license_extra
+ extra_crossref["license"] = license_extra
- if len(obj['title']) > 1:
- aliases = [clean(t) for t in obj['title'][1:]]
+ if len(obj["title"]) > 1:
+ aliases = [clean(t) for t in obj["title"][1:]]
aliases = [t for t in aliases if t]
if aliases:
- extra['aliases'] = aliases
+ extra["aliases"] = aliases
# ISBN
isbn13 = None
- for raw in obj.get('ISBN', []):
+ for raw in obj.get("ISBN", []):
# TODO: convert if not ISBN-13 format
if len(raw) == 17:
isbn13 = raw
break
# release status
- if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
- 'dissertation', 'book-chapter'):
+ if obj["type"] in (
+ "journal-article",
+ "conference-proceeding",
+ "book",
+ "dissertation",
+ "book-chapter",
+ ):
release_stage = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
# filter out unreasonably huge releases
if len(abstracts) > 100:
- self.counts['skip-huge-abstracts'] += 1
+ self.counts["skip-huge-abstracts"] += 1
return None
if len(contribs) > 2000:
- self.counts['skip-huge-contribs'] += 1
+ self.counts["skip-huge-contribs"] += 1
return None
if len(refs) > 5000:
- self.counts['skip-huge-refs'] += 1
+ self.counts["skip-huge-refs"] += 1
return None
# release date parsing is amazingly complex
- raw_date = obj['issued']['date-parts'][0]
+ raw_date = obj["issued"]["date-parts"][0]
if not raw_date or not raw_date[0]:
# got some NoneType, even though at least year is supposed to be set
release_year = None
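
The rest of the parsing falls between hunks; as the comment says, release date parsing is genuinely fiddly. A hedged sketch of one way to handle Crossref "date-parts" (not necessarily the elided code):

import datetime
from typing import Any, List, Optional, Tuple

def parse_date_parts(
    raw_date: Optional[List[Any]],
) -> Tuple[Optional[int], Optional[datetime.date]]:
    # Crossref "date-parts" is [year, month, day]; month and day are
    # optional, and any element may arrive as None.
    if not raw_date or not raw_date[0]:
        return None, None
    year = int(raw_date[0])
    if len(raw_date) == 3 and all(raw_date):
        try:
            return year, datetime.date(*[int(p) for p in raw_date])
        except ValueError:
            pass  # e.g. month or day out of range
    return year, None

assert parse_date_parts([2019, 4, 1]) == (2019, datetime.date(2019, 4, 1))
assert parse_date_parts([2019]) == (2019, None)
assert parse_date_parts([None]) == (None, None)
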
@@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter):
release_date = None
original_title: Optional[str] = None
- if obj.get('original-title'):
- ot = obj.get('original-title')
+ if obj.get("original-title"):
+ ot = obj.get("original-title")
if ot is not None:
original_title = clean(ot[0], force_xml=True)
title: Optional[str] = None
- if obj.get('title'):
- title = clean(obj.get('title')[0], force_xml=True)
+ if obj.get("title"):
+ title = clean(obj.get("title")[0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
- self.counts['skip-blank-title'] += 1
+ self.counts["skip-blank-title"] += 1
return None
subtitle = None
- if obj.get('subtitle'):
- subtitle = clean(obj.get('subtitle')[0], force_xml=True)
+ if obj.get("subtitle"):
+ subtitle = clean(obj.get("subtitle")[0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
- extra['crossref'] = extra_crossref
+ extra["crossref"] = extra_crossref
if not extra:
extra = None
@@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter):
release_year=release_year,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=obj['DOI'].lower(),
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
+ doi=obj["DOI"].lower(),
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
- volume=clean(obj.get('volume')),
- issue=clean(obj.get('issue')),
- pages=clean(obj.get('page')),
- language=clean(obj.get('language')),
+ volume=clean(obj.get("volume")),
+ issue=clean(obj.get("issue")),
+ pages=clean(obj.get("page")),
+ language=clean(obj.get("language")),
license_slug=license_slug,
extra=extra,
abstracts=abstracts,
@@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
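
Finally, a rough sketch of driving the importer end-to-end. The public_api helper and JsonLinePusher follow fatcat_tools conventions, but their exact signatures, the endpoint URL, and the ISSN-L map path are all assumptions here, and a real import would need an authenticated client rather than the public one.

import sys

from fatcat_tools import public_api              # assumed helper
from fatcat_tools.importers import CrossrefImporter, JsonLinePusher

# Assumed endpoint and ISSN-to-ISSN-L map path; adjust for a real
# deployment (writes require an authenticated API client).
api = public_api("http://localhost:9411/v0")
with open("ISSN-to-ISSN-L.txt") as issn_map_file:
    importer = CrossrefImporter(api, issn_map_file=issn_map_file)
    # Feed a Crossref works JSON-lines dump (one record per line) on stdin.
    JsonLinePusher(importer, sys.stdin).run()
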