From 1024e688bb12d64648ceb638daf049d508f87561 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 9 Nov 2021 18:13:15 -0800
Subject: importers: use clean_doi() in many more (all?) importers

---
 python/fatcat_tools/importers/arxiv.py           | 6 ++++--
 python/fatcat_tools/importers/cdl_dash_dat.py    | 6 ++++--
 python/fatcat_tools/importers/common.py          | 5 ++---
 python/fatcat_tools/importers/crossref.py        | 9 ++++++++-
 python/fatcat_tools/importers/grobid_metadata.py | 9 ++++++---
 python/fatcat_tools/importers/jstor.py           | 6 +++++-
 6 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1d50dd9a..dd2c2284 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -9,6 +9,8 @@ from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 from pylatexenc.latex2text import LatexNodes2Text
 
+from fatcat_tools.normal import clean_doi
+
 from .common import EntityImporter
 from .crossref import lookup_license_slug
 
@@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter):
         base_id = metadata.id.string
         doi = None
         if metadata.doi and metadata.doi.string:
-            doi = metadata.doi.string.lower().split()[0].strip()
-            if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
+            doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
+            if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
                 sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                 doi = None
         title = latex_to_text(metadata.title.get_text().replace("\n", " "))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index 1a4114a0..ec557e15 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -23,6 +23,8 @@ from fatcat_openapi_client import (
     ReleaseExtIds,
 )
 
+from fatcat_tools.normal import clean_doi
+
 from .common import clean
 from .crossref import lookup_license_slug
 
@@ -78,8 +80,8 @@ def cdl_dash_release(
     extra = dict()
 
     assert meta["identifier"]["type"] == "DOI"
-    doi = meta["identifier"]["value"].lower()
-    assert doi.startswith("10.")
+    doi = clean_doi(meta["identifier"]["value"].lower())
+    assert doi and doi.startswith("10.")
 
     ark_id = None
     for extid in meta.get("alternativeIdentifiers", []):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..425b6f13 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -29,7 +29,7 @@ from fuzzycat.matching import match_release_fuzzy
 
 # TODO: refactor so remove need for this (re-imports for backwards compatibility)
 from fatcat_tools.normal import is_cjk  # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex  # noqa: F401
+from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi  # noqa: F401
 from fatcat_tools.normal import clean_str as clean  # noqa: F401
 from fatcat_tools.transforms import entity_to_dict
 
@@ -342,8 +342,7 @@ class EntityImporter:
         return creator_id
 
     def is_doi(self, doi: str) -> bool:
-        # TODO: replace with clean_doi() from fatcat_tools.normal
-        return doi.startswith("10.") and doi.count("/") >= 1
+        return clean_doi(doi) is not None
 
     def lookup_doi(self, doi: str) -> Optional[str]:
         """Caches calls to the doi lookup API endpoint in a local dict
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 9c69fee3..c9f251fc 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,6 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
 
+from fatcat_tools.normal import clean_doi
+
 from .common import EntityImporter, clean
 
 # The docs/guide should be the canonical home for these mappings; update there
@@ -467,6 +469,11 @@ class CrossrefImporter(EntityImporter):
             self.counts["skip-blank-title"] += 1
             return None
 
+        doi = clean_doi(obj["DOI"].lower())
+        if not doi:
+            self.counts["skip-bad-doi"] += 1
+            return None
+
         subtitle = None
         if obj.get("subtitle"):
             subtitle = clean(obj["subtitle"][0], force_xml=True)
@@ -489,7 +496,7 @@ class CrossrefImporter(EntityImporter):
             release_year=release_year,
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=obj["DOI"].lower(),
+                doi=doi,
                 isbn13=isbn13,
             ),
             volume=clean(obj.get("volume")),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index e36e1b48..7c595787 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
+from fatcat_tools.normal import clean_doi
+
 from .common import EntityImporter, clean, make_rel_url
 
 MAX_ABSTRACT_BYTES = 4096
@@ -133,9 +135,10 @@ class GrobidMetadataImporter(EntityImporter):
             # only returns year, ever?
             release_year = int(obj["date"][:4])
 
-        extra = dict()
-        if obj.get("doi"):
-            extra["doi"] = obj["doi"]
+        extra: Dict[str, Any] = dict()
+        doi = clean_doi(obj.get("doi"))
+        if doi:
+            extra["doi"] = doi
         if obj["journal"] and obj["journal"].get("name"):
             extra["container_name"] = clean(obj["journal"]["name"])
 
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 2c8aa0a4..ca1f2466 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,6 +8,8 @@ import fatcat_openapi_client
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ApiClient, ReleaseEntity
 
+from fatcat_tools.normal import clean_doi
+
 from .common import LANG_MAP_MARC, EntityImporter, clean
 from .crossref import CONTAINER_TYPE_MAP
 
@@ -146,7 +148,9 @@ class JstorImporter(EntityImporter):
 
         doi = article_meta.find("article-id", {"pub-id-type": "doi"})
         if doi:
-            doi = doi.string.lower().strip()
+            doi = clean_doi(doi.string.lower())
+        else:
+            doi = None
 
         jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
         if jstor_id:
--
cgit v1.2.3
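
Note (not part of the patch): for readers unfamiliar with fatcat_tools.normal, the sketch below approximates the kind of DOI normalization these importers now delegate to clean_doi(). It is an illustration under stated assumptions, not the actual fatcat implementation; the regex, the accepted prefixes, and the helper names are invented here for clarity.

# Illustrative sketch (assumption): approximates the normalization behavior
# that clean_doi() from fatcat_tools.normal is relied on to provide.
import re
from typing import Optional

# loose DOI shape: "10.", a numeric registrant prefix, a slash, a non-empty suffix
DOI_REGEX = re.compile(r"^10\.\d{3,6}/\S+$")


def clean_doi_sketch(raw: Optional[str]) -> Optional[str]:
    """Return a lowercased, bare DOI string, or None if the value is unusable."""
    if not raw:
        return None
    raw = raw.strip().lower()
    # strip common URL and "doi:" prefix forms (assumed behavior)
    for prefix in ("https://doi.org/", "http://doi.org/", "doi:"):
        if raw.startswith(prefix):
            raw = raw[len(prefix):]
    if not DOI_REGEX.match(raw):
        return None
    return raw


def is_doi(doi: str) -> bool:
    # mirrors the simplified EntityImporter.is_doi() check from the patch
    return clean_doi_sketch(doi) is not None


if __name__ == "__main__":
    assert clean_doi_sketch("https://doi.org/10.1234/ABC.def") == "10.1234/abc.def"
    assert clean_doi_sketch("not-a-doi") is None
    assert is_doi("10.1000/xyz123") is True

Under this model, the patch's new EntityImporter.is_doi() reduces to "does clean_doi() return a value", which is why the old startswith("10.") / slash-count heuristic could be dropped, and why callers such as the Crossref importer can count a None result as skip-bad-doi.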