summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/crossref.py26
-rw-r--r--python/fatcat_tools/importers/dblp_release.py21
-rw-r--r--python/fatcat_tools/importers/doaj_article.py2
3 files changed, 27 insertions, 22 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 606d4bb1..d0017002 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -393,8 +393,6 @@ class CrossrefImporter(EntityImporter):
):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
- if not ref_extra:
- ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=i,
@@ -406,7 +404,7 @@ class CrossrefImporter(EntityImporter):
title=clean(rm.get("article-title")),
locator=clean(rm.get("first-page")),
# TODO: just dump JSON somewhere here?
- extra=ref_extra,
+ extra=ref_extra or None,
)
)
@@ -421,8 +419,8 @@ class CrossrefImporter(EntityImporter):
)
# extra fields
- extra = dict()
- extra_crossref = dict()
+ extra: Dict[str, Any] = dict()
+ extra_crossref: Dict[str, Any] = dict()
# top-level extra keys
if not container_id:
if obj.get("container-title"):
@@ -471,13 +469,13 @@ class CrossrefImporter(EntityImporter):
"dissertation",
"book-chapter",
):
- release_stage = "published"
+ release_stage: Optional[str] = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
# filter out unreasonably huge releases
if len(abstracts) > 100:
@@ -512,7 +510,7 @@ class CrossrefImporter(EntityImporter):
title: Optional[str] = None
if obj.get("title"):
- title = clean(obj.get("title")[0], force_xml=True)
+ title = clean(obj["title"][0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
self.counts["skip-blank-title"] += 1
@@ -520,15 +518,13 @@ class CrossrefImporter(EntityImporter):
subtitle = None
if obj.get("subtitle"):
- subtitle = clean(obj.get("subtitle")[0], force_xml=True)
+ subtitle = clean(obj["subtitle"][0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
extra["crossref"] = extra_crossref
- if not extra:
- extra = None
re = ReleaseEntity(
work_id=None,
@@ -556,10 +552,10 @@ class CrossrefImporter(EntityImporter):
pages=clean(obj.get("page")),
language=clean(obj.get("language")),
license_slug=license_slug,
- extra=extra,
- abstracts=abstracts,
- contribs=contribs,
- refs=refs,
+ extra=extra or None,
+ abstracts=abstracts or None,
+ contribs=contribs or None,
+ refs=refs or None,
)
return re
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5baa6cd6..e73e5f33 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -26,6 +26,7 @@ import sys # noqa: F401
import warnings
from typing import Any, List, Optional
+import bs4
import fatcat_openapi_client
from fatcat_tools.importers.common import EntityImporter
@@ -420,7 +421,9 @@ class DblpReleaseImporter(EntityImporter):
)
)
- def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
+ def dblp_contribs(
+ self, elem: bs4.element.Tag
+ ) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
- author (multiple; each a single string)
=> may have HTML entities
@@ -431,21 +434,23 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all("author"):
+ for elem in elem.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all("editor"):
+ for elem in elem.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
return contribs
- def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+ def dblp_contrib_single(
+ self, elem: bs4.element.Tag
+ ) -> fatcat_openapi_client.ReleaseContrib:
"""
In the future, might try to implement creator key-ificiation and lookup here.
@@ -461,11 +466,15 @@ class DblpReleaseImporter(EntityImporter):
raw_name = clean_str(elem.text)
# remove number in author name, if present
- if raw_name.split()[-1].isdigit():
+ if raw_name and raw_name.split()[-1].isdigit():
raw_name = " ".join(raw_name.split()[:-1])
if elem.get("orcid"):
- orcid = clean_orcid(elem["orcid"])
+ orcid_val = elem["orcid"]
+ if isinstance(orcid_val, list):
+ orcid = clean_orcid(orcid_val[0])
+ else:
+ orcid = clean_orcid(orcid_val)
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index cd063337..56045ea7 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -382,7 +382,7 @@ class DoajArticleImporter(EntityImporter):
if not license.get("open_access"):
continue
slug = license.get("type")
- if slug.startswith("CC "):
+ if slug and slug.startswith("CC "):
slug = slug.replace("CC ", "cc-").lower()
return slug
return None