summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/arxiv.py 6
-rwxr-xr-x python/fatcat_tools/importers/cdl_dash_dat.py 6
-rw-r--r-- python/fatcat_tools/importers/common.py 5
-rw-r--r-- python/fatcat_tools/importers/crossref.py 9
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 9
-rw-r--r-- python/fatcat_tools/importers/jstor.py 6
6 files changed, 29 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1d50dd9a..dd2c2284 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -9,6 +9,8 @@ from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
from pylatexenc.latex2text import LatexNodes2Text
+from fatcat_tools.normal import clean_doi
+
from .common import EntityImporter
from .crossref import lookup_license_slug
@@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter):
base_id = metadata.id.string
doi = None
if metadata.doi and metadata.doi.string:
- doi = metadata.doi.string.lower().split()[0].strip()
- if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
+ doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
+ if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
title = latex_to_text(metadata.title.get_text().replace("\n", " "))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index 1a4114a0..ec557e15 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -23,6 +23,8 @@ from fatcat_openapi_client import (
ReleaseExtIds,
)
+from fatcat_tools.normal import clean_doi
+
from .common import clean
from .crossref import lookup_license_slug
@@ -78,8 +80,8 @@ def cdl_dash_release(
extra = dict()
assert meta["identifier"]["type"] == "DOI"
- doi = meta["identifier"]["value"].lower()
- assert doi.startswith("10.")
+ doi = clean_doi(meta["identifier"]["value"].lower())
+ assert doi and doi.startswith("10.")
ark_id = None
for extid in meta.get("alternativeIdentifiers", []):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..425b6f13 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -29,7 +29,7 @@ from fuzzycat.matching import match_release_fuzzy
# TODO: refactor so remove need for this (re-imports for backwards compatibility)
from fatcat_tools.normal import is_cjk # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401
+from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi # noqa: F401
from fatcat_tools.normal import clean_str as clean # noqa: F401
from fatcat_tools.transforms import entity_to_dict
@@ -342,8 +342,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi: str) -> bool:
- # TODO: replace with clean_doi() from fatcat_tools.normal
- return doi.startswith("10.") and doi.count("/") >= 1
+ return clean_doi(doi) is not None
def lookup_doi(self, doi: str) -> Optional[str]:
"""Caches calls to the doi lookup API endpoint in a local dict
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 9c69fee3..c9f251fc 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,6 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
+from fatcat_tools.normal import clean_doi
+
from .common import EntityImporter, clean
# The docs/guide should be the canonical home for these mappings; update there
@@ -467,6 +469,11 @@ class CrossrefImporter(EntityImporter):
self.counts["skip-blank-title"] += 1
return None
+ doi = clean_doi(obj["DOI"].lower())
+ if not doi:
+ self.counts["skip-bad-doi"] += 1
+ return None
+
subtitle = None
if obj.get("subtitle"):
subtitle = clean(obj["subtitle"][0], force_xml=True)
@@ -489,7 +496,7 @@ class CrossrefImporter(EntityImporter):
release_year=release_year,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=obj["DOI"].lower(),
+ doi=doi,
isbn13=isbn13,
),
volume=clean(obj.get("volume")),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index e36e1b48..7c595787 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
+from fatcat_tools.normal import clean_doi
+
from .common import EntityImporter, clean, make_rel_url
MAX_ABSTRACT_BYTES = 4096
@@ -133,9 +135,10 @@ class GrobidMetadataImporter(EntityImporter):
# only returns year, ever?
release_year = int(obj["date"][:4])
- extra = dict()
- if obj.get("doi"):
- extra["doi"] = obj["doi"]
+ extra: Dict[str, Any] = dict()
+ doi = clean_doi(obj.get("doi"))
+ if doi:
+ extra["doi"] = doi
if obj["journal"] and obj["journal"].get("name"):
extra["container_name"] = clean(obj["journal"]["name"])
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 2c8aa0a4..ca1f2466 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,6 +8,8 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
+from fatcat_tools.normal import clean_doi
+
from .common import LANG_MAP_MARC, EntityImporter, clean
from .crossref import CONTAINER_TYPE_MAP
@@ -146,7 +148,9 @@ class JstorImporter(EntityImporter):
doi = article_meta.find("article-id", {"pub-id-type": "doi"})
if doi:
- doi = doi.string.lower().strip()
+ doi = clean_doi(doi.string.lower())
+ else:
+ doi = None
jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
if jstor_id: