| | | |
|---|---|---|
| author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
| committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
| commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch) | |
| tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers | |
| parent | 7e3f91f1a49ea85707cae31125021ba761f5373d (diff) | |
| parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff) | |
| download | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz, fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip | |
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from older stale branches (the Datacite subject metadata patch), but most are from yesterday and today. It's a bit of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to the importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- the `cdl_dash_dat` and `wayback_static` importers, which were written for a few specific early example entities and have since been superseded by other importers
- the "extid map" sqlite3 feature in several importers, which was only used for the initial bulk imports (and maybe should not have been used); a condensed sketch of the removed helper follows below
Refactors:
- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Not everything was moved, just the tables that were either generic or very large (they made the importer code hard to read)
- shuffled around relative imports and some function names (`clean_str` vs. `clean`); the resulting import pattern is sketched below
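
As a rough illustration (not a complete or authoritative list of what each module exports), importer modules now pull shared helpers from `fatcat_tools.normal` and the big static mappings from the new `fatcat_tools.biblio_lookup_tables`, instead of re-exporting them through `importers/common.py`:

```python
# Sketch of the post-refactor import pattern inside a fatcat_tools/importers
# module, based on the changes in this diff; the exact symbols available in
# each module may differ. The `record` dict is a toy input for illustration.
from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP, DOMAIN_REL_MAP
from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug

from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url

record = {"DOI": "10.1000/example.123", "publisher": "  Example  Press "}

publisher = clean_str(record.get("publisher"))   # was: clean(...)
doi = clean_doi(record["DOI"].lower())           # None if the DOI is bogus
license_slug = lookup_license_slug("https://creativecommons.org/licenses/by/4.0/")
```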
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to "fix" double-slashes in DOIs, which was causing more harm than good (some DOIs really do contain double-slashes!); see the sketch after this list
- remove some excess metadata from Datacite 'extra' fields
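
A hedged sketch of what the DOI change means in practice: the exact `clean_doi()` behavior is defined in `fatcat_tools.normal`, which is not part of this diff, so treat the expected values below as assumptions based on the commit message. Importers now pass raw DOIs through `clean_doi()` and skip the record if it returns `None`, rather than rewriting the string.

```python
from fatcat_tools.normal import clean_doi

# Some DOIs legitimately contain "//" in the suffix (e.g. many older
# APA-style DOIs); previously importers tried to collapse the double slash,
# which corrupted those identifiers.
print(clean_doi("10.1037//0033-2909.126.1.78"))  # expected: unchanged, not "fixed"

# Bogus strings are expected to come back as None; the Crossref importer now
# counts these under "skip-bad-doi" and drops the record instead of importing it.
print(clean_doi("not-a-doi"))  # expected: None
```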
Diffstat (limited to 'python/fatcat_tools/importers')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 8 |
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 4 |
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 6 |
| -rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 219 |
| -rw-r--r-- | python/fatcat_tools/importers/chocula.py | 8 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 69 |
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 209 |
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 237 |
| -rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 5 |
| -rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 39 |
| -rw-r--r-- | python/fatcat_tools/importers/jalc.py | 74 |
| -rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 8 |
| -rw-r--r-- | python/fatcat_tools/importers/jstor.py | 19 |
| -rw-r--r-- | python/fatcat_tools/importers/orcid.py | 10 |
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 324 |
| -rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 287 |
16 files changed, 146 insertions, 1380 deletions
| diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 06ecfd58..654be2e9 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -13,10 +13,8 @@ To run an import you combine two classes; one each of:  from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter  from .arxiv import ArxivRawImporter -from .cdl_dash_dat import auto_cdl_dash_dat  from .chocula import ChoculaImporter  from .common import ( -    LANG_MAP_MARC,      Bs4XmlFileListPusher,      Bs4XmlFilePusher,      Bs4XmlLargeFilePusher, @@ -28,11 +26,8 @@ from .common import (      KafkaJsonPusher,      LinePusher,      SqlitePusher, -    clean, -    is_cjk, -    make_kafka_consumer,  ) -from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug +from .crossref import CrossrefImporter  from .datacite import DataciteImporter  from .dblp_container import DblpContainerImporter  from .dblp_release import DblpReleaseImporter @@ -55,4 +50,3 @@ from .matched import MatchedImporter  from .orcid import OrcidImporter  from .pubmed import PubmedImporter  from .shadow import ShadowLibraryImporter -from .wayback_static import auto_wayback_static diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index b4a4d9ed..92289bb3 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, FileEntity -from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url +from fatcat_tools.normal import b32_hex + +from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url  ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 1d50dd9a..dd2c2284 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -9,6 +9,8 @@ from bs4 import BeautifulSoup  from fatcat_openapi_client import ApiClient, ReleaseEntity  from pylatexenc.latex2text import LatexNodes2Text +from fatcat_tools.normal import clean_doi +  from .common import EntityImporter  from .crossref import lookup_license_slug @@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter):          base_id = metadata.id.string          doi = None          if metadata.doi and metadata.doi.string: -            doi = metadata.doi.string.lower().split()[0].strip() -            if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): +            doi = clean_doi(metadata.doi.string.lower().split()[0].strip()) +            if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):                  sys.stderr.write("BOGUS DOI: {}\n".format(doi))                  doi = None          title = latex_to_text(metadata.title.get_text().replace("\n", " ")) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py deleted file mode 100755 index 1a4114a0..00000000 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 - -import hashlib -import json -import mimetypes -import os -import subprocess -import sys -import urllib -import urllib.parse -from typing import Any, Dict, List, Optional, Tuple - -import fatcat_openapi_client 
-import magic -from fatcat_openapi_client import ( -    ApiClient, -    Editgroup, -    FilesetEntity, -    FilesetFile, -    ReleaseAbstract, -    ReleaseContrib, -    ReleaseEntity, -    ReleaseExtIds, -) - -from .common import clean -from .crossref import lookup_license_slug - - -def single_file(prefix: str, path: str) -> FilesetFile: - -    full = prefix + path -    size_bytes = os.stat(full).st_size - -    hashes = [ -        hashlib.md5(), -        hashlib.sha1(), -        hashlib.sha256(), -    ] -    with open(full, "rb") as fp: -        while True: -            data = fp.read(2 ** 20) -            if not data: -                break -            for h in hashes: -                h.update(data) -    mime = magic.Magic(mime=True).from_file(full) -    if mime == "application/octet-stream": -        # magic apparently isn't that great; try using filename as well -        guess = mimetypes.guess_type(full)[0] -        if guess: -            mime = guess - -    fsf = FilesetFile( -        path=path, -        size=size_bytes, -        md5=hashes[0].hexdigest(), -        sha1=hashes[1].hexdigest(), -        sha256=hashes[2].hexdigest(), -        extra=dict(mimetype=mime), -    ) -    return fsf - - -def make_manifest(base_dir: str) -> List[FilesetFile]: -    manifest = [] -    for root, dirs, files in os.walk(base_dir): -        for f in files: -            manifest.append(single_file(root, f)) -    return manifest - - -def cdl_dash_release( -    meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None -) -> ReleaseEntity: - -    if not extra: -        extra = dict() - -    assert meta["identifier"]["type"] == "DOI" -    doi = meta["identifier"]["value"].lower() -    assert doi.startswith("10.") - -    ark_id = None -    for extid in meta.get("alternativeIdentifiers", []): -        if extid["value"].startswith("ark:"): -            ark_id = extid["value"] -    assert ark_id - -    license_slug = lookup_license_slug(meta["rights"]["uri"]) - -    abstracts = [] -    for desc in meta["descriptions"]: -        if desc["type"] == "abstract": -            abstracts.append( -                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) -            ) -            # print(abstracts) - -    contribs = [] -    for creator in meta["creator"]: -        contribs.append( -            ReleaseContrib( -                given_name=creator["given"], -                surname=creator["family"], -                # sorry everybody -                raw_name="{} {}".format(creator["given"], creator["family"]), -                raw_affiliation=creator.get("affiliation"), -                role="author",  # presumably, for these datasets? 
-            ) -        ) - -    r = ReleaseEntity( -        ext_ids=ReleaseExtIds( -            doi=doi, -            ark=ark_id, -        ), -        title=clean(meta["title"], force_xml=True), -        publisher=clean(meta["publisher"]), -        release_year=int(meta["publicationYear"]), -        release_type="dataset", -        license_slug=license_slug, -        contribs=contribs, -        abstracts=abstracts or None, -        extra=extra, -    ) -    return r - - -def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: - -    if dat_path.endswith("/"): -        dat_path = dat_path[:-1] -    dat_discovery = dat_path -    extra = dict() -    assert len(dat_discovery) == 64 - -    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: -        meta_dict = json.loads(fp.read()) - -    release = cdl_dash_release(meta_dict) -    ark_id = release.extra["ark_id"] - -    dash_version = None -    # really crude XML parse-out -    with open(dat_path + "/stash-wrapper.xml", "r") as fp: -        for line in fp: -            line = line.strip() -            if line.startswith("<st:version_number>"): -                dash_version = int(line[19:].split("<")[0]) -    assert dash_version is not None -    extra["cdl_dash"] = dict(version=dash_version) -    release.extra["cdl_dash"] = dict(version=dash_version) - -    manifest = make_manifest(dat_path + "/files/") - -    bundle_url = dict( -        url="https://merritt.cdlib.org/u/{}/{}".format( -            urllib.parse.quote(ark_id, safe=""), dash_version -        ), -        rel="repo-bundle", -    ) -    repo_url = dict( -        url="https://merritt.cdlib.org/d/{}/{}/".format( -            urllib.parse.quote(ark_id, safe=""), dash_version -        ), -        rel="repo", -    ) -    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") -    fs = FilesetEntity( -        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra -    ) -    return (release, fs) - - -def auto_cdl_dash_dat( -    api: ApiClient, -    dat_path: str, -    release_id: Optional[str] = None, -    editgroup_id: Optional[str] = None, -) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: - -    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - -    (release, fileset) = make_release_fileset(dat_path) - -    if not editgroup_id: -        eg = api.create_editgroup( -            Editgroup( -                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", -                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), -            ) -        ) -        editgroup_id = eg.editgroup_id - -    if not release_id and release.ext_ids.doi: -        try: -            r = api.lookup_release(doi=release.ext_ids.doi) -            release_id = r.ident -        except fatcat_openapi_client.rest.ApiException: -            pass -    if not release_id: -        edit = api.create_release(eg.editgroup_id, release) -        release_id = edit.ident - -    release = api.get_release(release_id, expand="filesets") -    if len(release.filesets): -        print("A fileset already exists for release {}".format(release.ident)) -        return (None, None, None) - -    fileset.release_ids = [release.ident] -    edit = api.create_fileset(eg.editgroup_id, fileset) -    fileset = api.get_fileset(edit.ident) -    return (editgroup_id, release, fileset) - - -if __name__ == "__main__": -    # pass this 
a discovery key that has been cloned to the local directory -    print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 842c7853..c44fec3b 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter  class ChoculaImporter(EntityImporter): @@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):          returns a ContainerEntity (or None if invalid or couldn't parse)          """ -        name = clean(row.get("name")) +        name = clean_str(row.get("name"))          if not name:              # Name is required (by schema)              return None @@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):              ident=row["ident"],              name=name,              container_type=container_type, -            publisher=clean(row.get("publisher")), +            publisher=clean_str(row.get("publisher")),              wikidata_qid=row.get("wikidata_qid"),              extra=extra,          ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2ec6efda..e2157ee5 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -27,74 +27,14 @@ from fatcat_openapi_client import (  from fatcat_openapi_client.rest import ApiException  from fuzzycat.matching import match_release_fuzzy -# TODO: refactor so remove need for this (re-imports for backwards compatibility) -from fatcat_tools.normal import is_cjk  # noqa: F401 -from fatcat_tools.normal import LANG_MAP_MARC, b32_hex  # noqa: F401 -from fatcat_tools.normal import clean_str as clean  # noqa: F401 +from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP +from fatcat_tools.normal import clean_doi  from fatcat_tools.transforms import entity_to_dict  DATE_FMT: str = "%Y-%m-%d"  SANE_MAX_RELEASES: int = 200  SANE_MAX_URLS: int = 100 - -DOMAIN_REL_MAP: Dict[str, str] = { -    "archive.org": "archive", -    # LOCKSS, Portico, DuraSpace, etc would also be "archive" -    "arxiv.org": "repository", -    "babel.hathitrust.org": "repository", -    "cds.cern.ch": "repository", -    "deepblue.lib.umich.edu": "repository", -    "europepmc.org": "repository", -    "hal.inria.fr": "repository", -    "scielo.isciii.es": "repository", -    "www.dtic.mil": "repository", -    "www.jstage.jst.go.jp": "repository", -    "www.jstor.org": "repository", -    "www.ncbi.nlm.nih.gov": "repository", -    "ftp.ncbi.nlm.nih.gov": "repository", -    "www.scielo.br": "repository", -    "www.scielo.cl": "repository", -    "www.scielo.org.mx": "repository", -    "zenodo.org": "repository", -    "www.biorxiv.org": "repository", -    "www.medrxiv.org": "repository", -    "citeseerx.ist.psu.edu": "aggregator", -    "publisher-connector.core.ac.uk": "aggregator", -    "core.ac.uk": "aggregator", -    "static.aminer.org": "aggregator", -    "aminer.org": "aggregator", -    "pdfs.semanticscholar.org": "aggregator", -    "semanticscholar.org": "aggregator", -    "www.semanticscholar.org": "aggregator", -    "academic.oup.com": "publisher", -    "cdn.elifesciences.org": "publisher", -    "cell.com": "publisher", -    "dl.acm.org": "publisher", -    "downloads.hindawi.com": "publisher", -   
 "elifesciences.org": "publisher", -    "iopscience.iop.org": "publisher", -    "journals.plos.org": "publisher", -    "link.springer.com": "publisher", -    "onlinelibrary.wiley.com": "publisher", -    "works.bepress.com": "publisher", -    "www.biomedcentral.com": "publisher", -    "www.cell.com": "publisher", -    "www.nature.com": "publisher", -    "www.pnas.org": "publisher", -    "www.tandfonline.com": "publisher", -    "www.frontiersin.org": "publisher", -    "www.degruyter.com": "publisher", -    "www.mdpi.com": "publisher", -    "www.ahajournals.org": "publisher", -    "ehp.niehs.nih.gov": "publisher", -    "journals.tsu.ru": "publisher", -    "www.cogentoa.com": "publisher", -    "www.researchgate.net": "academicsocial", -    "academia.edu": "academicsocial", -    "wayback.archive-it.org": "webarchive", -    "web.archive.org": "webarchive", -    "archive.is": "webarchive", -} +MAX_ABSTRACT_LENGTH: int = 2048  def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]: @@ -342,8 +282,7 @@ class EntityImporter:          return creator_id      def is_doi(self, doi: str) -> bool: -        # TODO: replace with clean_doi() from fatcat_tools.normal -        return doi.startswith("10.") and doi.count("/") >= 1 +        return clean_doi(doi) is not None      def lookup_doi(self, doi: str) -> Optional[str]:          """Caches calls to the doi lookup API endpoint in a local dict diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index a41e2bf5..52bd7465 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,11 +1,13 @@  import datetime -import sqlite3  from typing import Any, Dict, List, Optional, Sequence  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from .common import EntityImporter, clean +from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug + +from .common import EntityImporter  # The docs/guide should be the canonical home for these mappings; update there  # first @@ -32,104 +34,11 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {      "standard": "standard",  } -CONTAINER_TYPE_MAP: Dict[str, str] = { -    "article-journal": "journal", -    "paper-conference": "conference", -    "book": "book-series", -} - -# These are based, informally, on sorting the most popular licenses found in -# Crossref metadata. There were over 500 unique strings and only a few most -# popular are here; many were variants of the CC URLs. Would be useful to -# normalize CC licenses better. -# The current norm is to only add license slugs that are at least partially OA. 
-LICENSE_SLUG_MAP: Dict[str, str] = { -    "//creativecommons.org/publicdomain/mark/1.0": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", -    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0", -    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", -    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", -    "//creativecommons.org/licenses/by/2.0/": "CC-BY", -    "//creativecommons.org/licenses/by/3.0/": "CC-BY", -    "//creativecommons.org/licenses/by/4.0/": "CC-BY", -    "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", -    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", -    "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", -    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", -    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", -    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", -    "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", -    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", -    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", -    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", -    "//spdx.org/licenses/CC0-1.0.json": "CC-0", -    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", -    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", -    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", -    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", -    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", -    "//spdx.org/licenses/MIT.json": "MIT", -    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", -    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", -    "//www.karger.com/Services/SiteLicenses": "KARGER", -    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", -    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", -    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", -    "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", -    "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", -    "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC", -    # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license -    # //www.springer.com/tdm doesn't seem like a license -    # //iopscience.iop.org/page/copyright is closed -    # //www.acm.org/publications/policies/copyright_policy#Background is closed -    # //rsc.li/journals-terms-of-use is closed for vor (am open) -    # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!) 
-    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", -} - - -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: -    if not raw: -        return None -    raw = raw.strip().replace("http://", "//").replace("https://", "//") -    if "creativecommons.org" in raw.lower(): -        raw = raw.lower() -        raw = raw.replace("/legalcode", "/").replace("/uk", "") -        if not raw.endswith("/"): -            raw = raw + "/" -    return LICENSE_SLUG_MAP.get(raw) - - -def test_lookup_license_slug() -> None: - -    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" -    assert ( -        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") -        == "CC-BY" -    ) -    assert ( -        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") -        == "CC-0" -    ) -    assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" -    assert ( -        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") -        == "CC-BY-NC-SA" -    ) -    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" -    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None -    assert lookup_license_slug("") is None -    assert lookup_license_slug(None) is None -  class CrossrefImporter(EntityImporter):      """      Importer for Crossref metadata. -    Can use a local sqlite3 file for faster "external identifier" lookups -      See https://github.com/CrossRef/rest-api-doc for JSON schema notes      """ @@ -150,50 +59,8 @@ class CrossrefImporter(EntityImporter):          )          self.create_containers: bool = kwargs.get("create_containers", True) -        extid_map_file = kwargs.get("extid_map_file") -        self.extid_map_db: Optional[Any] = None -        if extid_map_file: -            db_uri = "file:{}?mode=ro".format(extid_map_file) -            print("Using external ID map: {}".format(db_uri)) -            self.extid_map_db = sqlite3.connect(db_uri, uri=True) -        else: -            print("Not using external ID map") -          self.read_issn_map_file(issn_map_file) -    def lookup_ext_ids(self, doi: str) -> Optional[Any]: -        if self.extid_map_db is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = self.extid_map_db.execute( -            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] -        ).fetchone() -        if row is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = [str(cell or "") or None for cell in row] -        return dict( -            core_id=row[0], -            pmid=row[1], -            pmcid=row[2], -            wikidata_qid=row[3], -            # TODO: -            arxiv_id=None, -            jstor_id=None, -        ) -      def map_release_type(self, crossref_type: str) -> Optional[str]:          return CROSSREF_TYPE_MAP.get(crossref_type) @@ -275,21 +142,21 @@ class CrossrefImporter(EntityImporter):                      if len(affiliation_list) > 1:                          # note: affiliation => more_affiliations                          extra["more_affiliations"] = [ -                            clean(a["name"]) for a in affiliation_list[1:] +                            clean_str(a["name"]) for a in affiliation_list[1:]                          ]                  if am.get("sequence") and am.get("sequence") != "additional": -                    extra["seq"] = clean(am.get("sequence")) +                    extra["seq"] = clean_str(am.get("sequence"))                  assert ctype in ("author", "editor", "translator") -                raw_name = clean(raw_name) +                raw_name = clean_str(raw_name)                  # TODO: what if 'raw_name' is None?                  contribs.append(                      ReleaseContrib(                          creator_id=creator_id,                          index=index,                          raw_name=raw_name, -                        given_name=clean(am.get("given")), -                        surname=clean(am.get("family")), -                        raw_affiliation=clean(raw_affiliation), +                        given_name=clean_str(am.get("given")), +                        surname=clean_str(am.get("family")), +                        raw_affiliation=clean_str(raw_affiliation),                          role=ctype,                          extra=extra or None,                      ) @@ -306,11 +173,11 @@ class CrossrefImporter(EntityImporter):          container_id = None          if issnl:              container_id = self.lookup_issnl(issnl) -        publisher = clean(obj.get("publisher")) +        publisher = clean_str(obj.get("publisher"))          container_name = obj.get("container-title")          if container_name: -            container_name = clean(container_name[0], force_xml=True) +            container_name = clean_str(container_name[0], force_xml=True)          if not container_name:              container_name = None          if ( @@ -366,7 +233,7 @@ class CrossrefImporter(EntityImporter):                  ref_extra["journal-title"] = rm["journal-title"]              if rm.get("DOI"):                  ref_extra["doi"] = rm.get("DOI").lower() -            author = clean(rm.get("author")) +            author = clean_str(rm.get("author"))              if author:                  ref_extra["authors"] = [author]              for k in ( @@ -390,8 +257,8 @@ class CrossrefImporter(EntityImporter):                  "series-title",                  "volume-title",              ): -                if clean(rm.get(k)): -                    ref_extra[k] = clean(rm[k]) +                if clean_str(rm.get(k)): +                    ref_extra[k] = clean_str(rm[k])              
refs.append(                  fatcat_openapi_client.ReleaseRef(                      index=i, @@ -399,9 +266,9 @@ class CrossrefImporter(EntityImporter):                      target_release_id=None,                      key=key,                      year=year, -                    container_name=clean(ref_container_name), -                    title=clean(rm.get("article-title")), -                    locator=clean(rm.get("first-page")), +                    container_name=clean_str(ref_container_name), +                    title=clean_str(rm.get("article-title")), +                    locator=clean_str(rm.get("first-page")),                      # TODO: just dump JSON somewhere here?                      extra=ref_extra or None,                  ) @@ -409,7 +276,7 @@ class CrossrefImporter(EntityImporter):          # abstracts          abstracts = [] -        abstract = clean(obj.get("abstract")) +        abstract = clean_str(obj.get("abstract"))          if abstract and len(abstract) > 10:              abstracts.append(                  fatcat_openapi_client.ReleaseAbstract( @@ -430,9 +297,9 @@ class CrossrefImporter(EntityImporter):                  if type(val) == list:                      val = val[0]                  if type(val) == str: -                    val = clean(val) +                    val = clean_str(val)                      if val: -                        extra[key] = clean(val) +                        extra[key] = clean_str(val)                  else:                      extra[key] = val          # crossref-nested extra keys @@ -440,14 +307,14 @@ class CrossrefImporter(EntityImporter):              val = obj.get(key)              if val:                  if type(val) == str: -                    extra_crossref[key] = clean(val) +                    extra_crossref[key] = clean_str(val)                  else:                      extra_crossref[key] = val          if license_extra:              extra_crossref["license"] = license_extra          if len(obj["title"]) > 1: -            aliases = [clean(t) for t in obj["title"][1:]] +            aliases = [clean_str(t) for t in obj["title"][1:]]              aliases = [t for t in aliases if t]              if aliases:                  extra["aliases"] = aliases @@ -473,9 +340,6 @@ class CrossrefImporter(EntityImporter):              # unknown              release_stage = None -        # external identifiers -        extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} -          # filter out unreasonably huge releases          if len(abstracts) > 100:              self.counts["skip-huge-abstracts"] += 1 @@ -505,19 +369,24 @@ class CrossrefImporter(EntityImporter):          if obj.get("original-title"):              ot = obj.get("original-title")              if ot is not None: -                original_title = clean(ot[0], force_xml=True) +                original_title = clean_str(ot[0], force_xml=True)          title: Optional[str] = None          if obj.get("title"): -            title = clean(obj["title"][0], force_xml=True) +            title = clean_str(obj["title"][0], force_xml=True)              if not title or len(title) <= 1:                  # title can't be just a single character                  self.counts["skip-blank-title"] += 1                  return None +        doi = clean_doi(obj["DOI"].lower()) +        if not doi: +            self.counts["skip-bad-doi"] += 1 +            return None +          subtitle = None          if obj.get("subtitle"): -            subtitle = 
clean(obj["subtitle"][0], force_xml=True) +            subtitle = clean_str(obj["subtitle"][0], force_xml=True)              if not subtitle or len(subtitle) <= 1:                  # subtitle can't be just a single character                  subtitle = None @@ -537,19 +406,13 @@ class CrossrefImporter(EntityImporter):              release_year=release_year,              publisher=publisher,              ext_ids=fatcat_openapi_client.ReleaseExtIds( -                doi=obj["DOI"].lower(), -                pmid=extids["pmid"], -                pmcid=extids["pmcid"], -                wikidata_qid=extids["wikidata_qid"], +                doi=doi,                  isbn13=isbn13, -                core=extids["core_id"], -                arxiv=extids["arxiv_id"], -                jstor=extids["jstor_id"],              ), -            volume=clean(obj.get("volume")), -            issue=clean(obj.get("issue")), -            pages=clean(obj.get("page")), -            language=clean(obj.get("language")), +            volume=clean_str(obj.get("volume")), +            issue=clean_str(obj.get("issue")), +            pages=clean_str(obj.get("page")), +            language=clean_str(obj.get("language")),              license_slug=license_slug,              extra=extra or None,              abstracts=abstracts or None, diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d5622960..b310f8bc 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -12,7 +12,6 @@ import collections  import datetime  import json  import re -import sqlite3  import sys  from typing import Any, Dict, List, Optional, Sequence, Set, Tuple @@ -22,113 +21,19 @@ import langdetect  import pycountry  from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug  from fatcat_tools.transforms import entity_to_dict -from .common import EntityImporter, clean - -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter  # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP: Dict[str, str] = { +DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {      "Journal": "journal",      "Series": "journal",      "Book Series": "book-series",  } -# The docs/guide should be the canonical home for these mappings; update there -# first.  Map various datacite type types to CSL-ish types. None means TODO or -# remove. 
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { -    "ris": { -        "THES": "thesis", -        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report) -        "CHAP": "chapter", -        "FIGURE": "figure", -        "RPRT": "report", -        "JOUR": "article-journal", -        "MPCT": "motion_picture", -        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset -        "BOOK": "book", -        "DATA": "dataset", -        "COMP": "software", -    }, -    "schemaOrg": { -        "Dataset": "dataset", -        "Book": "book", -        "ScholarlyArticle": "article-journal", -        "ImageObject": "graphic", -        "Collection": None, -        "MediaObject": None, -        "Event": None, -        "SoftwareSourceCode": "software", -        "Chapter": "chapter", -        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. -        "PublicationIssue": "article", -        "AudioObject": None, -        "Thesis": "thesis", -    }, -    "citeproc": { -        "article": "article", -        "article-journal": "article-journal", -        "article-magazine": "article-magazine", -        "article-newspaper": "article-newspaper", -        "bill": "bill", -        "book": "book", -        "broadcast": "broadcast", -        "chapter": "chapter", -        "dataset": "dataset", -        "entry-dictionary": "entry-dictionary", -        "entry-encyclopedia": "entry-encyclopedia", -        "entry": "entry", -        "figure": "figure", -        "graphic": "graphic", -        "interview": "interview", -        "legal_case": "legal_case", -        "legislation": "legislation", -        "manuscript": "manuscript", -        "map": "map", -        "motion_picture": "motion_picture", -        "musical_score": "musical_score", -        "pamphlet": "pamphlet", -        "paper-conference": "paper-conference", -        "patent": "patent", -        "personal_communication": "personal_communication", -        "post": "post", -        "post-weblog": "post-weblog", -        "report": "report", -        "review-book": "review-book", -        "review": "review", -        "song": "song", -        "speech": "speech", -        "thesis": "thesis", -        "treaty": "treaty", -        "webpage": "webpage", -    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types -    "bibtex": { -        "phdthesis": "thesis", -        "inbook": "chapter", -        "misc": None, -        "article": "article-journal", -        "book": "book", -    }, -    "resourceTypeGeneral": { -        "Image": "graphic", -        "Dataset": "dataset", -        "PhysicalObject": None, -        "Collection": None, -        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials" -        "Sound": None, -        "InteractiveResource": None, -        "Event": None, -        "Software": "software", -        "Other": None, -        "Workflow": None, -        "Audiovisual": None, -    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 -} -  # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.  DATACITE_UNKNOWN_MARKERS: List[str] = [      "(:unac)",  # temporarily inaccessible @@ -181,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [      }  ] -# TODO(martin): merge this with other maps and lookup functions, eventually. 
-LICENSE_SLUG_MAP: Dict[str, str] = { -    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", -    "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", -    "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", -    "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1", -    "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET", -    "//onlinelibrary.wiley.com/termsandconditions/": "WILEY", -    "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN", -    "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY", -    "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE", -    "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC", -    "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA", -    "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", -    "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", -    "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0", -    "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3", -    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2", -    "//www.karger.com/Services/SiteLicenses/": "KARGER", -    "//www.springer.com/tdm/": "SPRINGER-TDM", -    "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM", -    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", -    "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", -    "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", -    "//spdx.org/licenses/CC0-1.0.json": "CC-0", -    "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", -    "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", -    "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", -    "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", -    "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", -    "//spdx.org/licenses/MIT.json": "MIT", -    "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", -} -  class DataciteImporter(EntityImporter):      """ @@ -248,15 +116,6 @@ class DataciteImporter(EntityImporter):          )          self.create_containers = kwargs.get("create_containers", True) -        extid_map_file = kwargs.get("extid_map_file") -        self.extid_map_db = None -        if extid_map_file: -            db_uri = "file:{}?mode=ro".format(extid_map_file) -            print("Using external ID map: {}".format(db_uri), file=sys.stderr) -            self.extid_map_db = sqlite3.connect(db_uri, uri=True) -        else: -            print("Not using external ID map", file=sys.stderr) -          self.read_issn_map_file(issn_map_file)          self.debug = debug          self.insert_log_file = insert_log_file @@ -264,42 +123,6 @@ class DataciteImporter(EntityImporter):          print("datacite with debug={}".format(self.debug), file=sys.stderr) -    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: -        """ -        Return dictionary of identifiers referring to the same things as the given DOI. 
-        """ -        if self.extid_map_db is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = self.extid_map_db.execute( -            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] -        ).fetchone() -        if row is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = [str(cell or "") or None for cell in row] -        return dict( -            core_id=row[0], -            pmid=row[1], -            pmcid=row[2], -            wikidata_qid=row[3], -            # TODO: -            arxiv_id=None, -            jstor_id=None, -        ) -      def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:          """          Mapping datacite JSON to ReleaseEntity. @@ -368,7 +191,7 @@ class DataciteImporter(EntityImporter):              print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)              return False -        title = clean(title) +        title = clean_str(title)          if not title:              print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)              return False @@ -387,7 +210,7 @@ class DataciteImporter(EntityImporter):          if not subtitle:              subtitle = None          else: -            subtitle = clean(subtitle) +            subtitle = clean_str(subtitle)          # Dates. A few internal dates (registered, created, updated) and          # published (0..2554). We try to work with typed date list, in @@ -445,15 +268,15 @@ class DataciteImporter(EntityImporter):              publisher = None          if publisher: -            publisher = clean(publisher) +            publisher = clean_str(publisher)          # Container. For the moment, only ISSN as container.          container_id = None          container_name = None          container = attributes.get("container", {}) or {} -        if container.get("type") in CONTAINER_TYPE_MAP.keys(): -            container_type = CONTAINER_TYPE_MAP.get(container["type"]) +        if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys(): +            container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])              if container.get("identifier") and container.get("identifierType") == "ISSN":                  issn = container.get("identifier")                  if issn and len(issn) == 8: @@ -506,10 +329,10 @@ class DataciteImporter(EntityImporter):          issue = container.get("issue")          if volume: -            volume = clean(volume) +            volume = clean_str(volume)          if issue: -            issue = clean(issue) +            issue = clean_str(issue)          # Pages.          
pages = None @@ -534,7 +357,7 @@ class DataciteImporter(EntityImporter):          license_extra = []          for lic in attributes.get("rightsList", []): -            slug = lookup_license_slug(lic.get("rightsUri")) +            slug = datacite_lookup_license_slug(lic.get("rightsUri"))              if slug:                  license_slug = slug              license_extra.append(lic) @@ -594,7 +417,7 @@ class DataciteImporter(EntityImporter):                      "[{}] language detection failed with {} on {}".format(doi, err, text),                      file=sys.stderr,                  ) -            abstract_text = clean(text) +            abstract_text = clean_str(text)              if not abstract_text:                  continue              abstracts.append( @@ -643,7 +466,13 @@ class DataciteImporter(EntityImporter):          if license_extra:              extra_datacite["license"] = license_extra          if attributes.get("subjects"): -            extra_datacite["subjects"] = attributes["subjects"] +            # these subjects with schemeUri are too much metadata, which +            # doesn't compress. filter them out. +            extra_subjects = [ +                subj for subj in attributes["subjects"] if not subj.get("schemeUri") +            ] +            if extra_subjects: +                extra_datacite["subjects"] = extra_subjects          # Include version information.          metadata_version = attributes.get("metadataVersion") or "" @@ -706,8 +535,6 @@ class DataciteImporter(EntityImporter):          if release_month:              extra["release_month"] = release_month -        extids = self.lookup_ext_ids(doi=doi) -          # Assemble release.          re = fatcat_openapi_client.ReleaseEntity(              work_id=None, @@ -722,12 +549,6 @@ class DataciteImporter(EntityImporter):              publisher=publisher,              ext_ids=fatcat_openapi_client.ReleaseExtIds(                  doi=doi, -                pmid=extids["pmid"], -                pmcid=extids["pmcid"], -                wikidata_qid=extids["wikidata_qid"], -                core=extids["core_id"], -                arxiv=extids["arxiv_id"], -                jstor=extids["jstor_id"],              ),              contribs=contribs,              volume=volume, @@ -922,14 +743,14 @@ class DataciteImporter(EntityImporter):                  if len(affiliations) == 0:                      raw_affiliation = None                  else: -                    raw_affiliation = clean(affiliations[0]) +                    raw_affiliation = clean_str(affiliations[0])                  name = c.get("name")                  given_name = c.get("givenName")                  surname = c.get("familyName")                  if name: -                    name = clean(name) +                    name = clean_str(name)                  if not any((name, given_name, surname)):                      continue                  if not name: @@ -943,8 +764,8 @@ class DataciteImporter(EntityImporter):                      name = index_form_to_display_name(name)                  if given_name: -                    given_name = clean(given_name) -                surname = clean(surname) +                    given_name = clean_str(given_name) +                surname = clean_str(surname)                  # Perform a final assertion that name does not reduce to zero                  # (e.g. whitespace only name). 
@@ -1016,7 +837,7 @@ def contributor_list_contains_contributor(      return False -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: +def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:      """      Resolve a variety of strings into a some pseudo-canonical form, e.g.      CC-BY-ND, CC-0, MIT and so on. @@ -1111,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:              return None          return "RS-{}".format(name.upper()) -    # Fallback to mapped values. -    raw = raw.lower() -    raw = raw.strip().replace("http://", "//").replace("https://", "//") -    if not raw.endswith("/"): -        raw = raw + "/" -    return LICENSE_SLUG_MAP.get(raw) +    # Fallback to generic license lookup +    return lookup_license_slug(raw)  def find_original_language_title( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index f5c886a2..92dbe574 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter  from fatcat_tools.normal import (      clean_doi,      clean_orcid, @@ -24,9 +24,6 @@ from fatcat_tools.normal import (      parse_month,  ) -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 -  class DoajArticleImporter(EntityImporter):      def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index e36e1b48..3c85132c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity -from .common import EntityImporter, clean, make_rel_url +from fatcat_tools.normal import clean_doi, clean_str -MAX_ABSTRACT_BYTES = 4096 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url  class GrobidMetadataImporter(EntityImporter): @@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter):          extra_grobid: Dict[str, Any] = dict()          abstract = obj.get("abstract") -        if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: +        if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:              abobj = fatcat_openapi_client.ReleaseAbstract( -                mimetype="text/plain", content=clean(obj.get("abstract")) +                mimetype="text/plain", content=clean_str(obj.get("abstract"))              )              abstracts = [abobj]          else: @@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter):              contribs.append(                  fatcat_openapi_client.ReleaseContrib(                      index=i, -                    raw_name=clean(a["name"]), -                    given_name=clean(a.get("given_name")), -                    surname=clean(a.get("surname")), +                    raw_name=clean_str(a["name"]), +                    given_name=clean_str(a.get("given_name")), +                    surname=clean_str(a.get("surname")),                      role="author",                      
extra=None,                  ) @@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter):                      pass              for key in ("volume", "url", "issue", "publisher"):                  if raw.get(key): -                    cite_extra[key] = clean(raw[key]) +                    cite_extra[key] = clean_str(raw[key])              if raw.get("authors"): -                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] +                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]              refs.append(                  fatcat_openapi_client.ReleaseRef( -                    key=clean(raw.get("id")), +                    key=clean_str(raw.get("id")),                      year=year, -                    title=clean(raw["title"]), +                    title=clean_str(raw["title"]),                      extra=cite_extra or None,                  )              ) @@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter):              # only returns year, ever?              release_year = int(obj["date"][:4]) -        extra = dict() -        if obj.get("doi"): -            extra["doi"] = obj["doi"] +        extra: Dict[str, Any] = dict() +        doi = clean_doi(obj.get("doi")) +        if doi: +            extra["doi"] = doi          if obj["journal"] and obj["journal"].get("name"): -            extra["container_name"] = clean(obj["journal"]["name"]) +            extra["container_name"] = clean_str(obj["journal"]["name"])          # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter):          if self.longtail_oa:              extra["longtail_oa"] = True -        clean_title = clean(obj["title"], force_xml=True) +        clean_title = clean_str(obj["title"], force_xml=True)          if not clean_title or len(clean_title) < 2:              return None          title = clean_title @@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter):              release_year=release_year,              contribs=contribs,              refs=refs, -            publisher=clean(obj["journal"].get("publisher")), -            volume=clean(obj["journal"].get("volume")), -            issue=clean(obj["journal"].get("issue")), +            publisher=clean_str(obj["journal"].get("publisher")), +            volume=clean_str(obj["journal"].get("volume")), +            issue=clean_str(obj["journal"].get("issue")),              abstracts=abstracts or None,              ext_ids=fatcat_openapi_client.ReleaseExtIds(),              extra=extra or None, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..9916a55f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@  import datetime -import sqlite3  import sys  from typing import Any, Dict, List, Optional, Sequence @@ -7,9 +6,9 @@ import fatcat_openapi_client  from bs4 import BeautifulSoup  from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str, is_cjk -from .common import DATE_FMT, EntityImporter, clean, is_cjk +from .common import DATE_FMT, EntityImporter  # TODO: should be List[Tag] not List[Any] for full type annotations @@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:      for raw in raw_persons:          name = raw.find("name") or None          if name: -       
     name = clean(name.get_text().replace("\n", " ")) +            name = clean_str(name.get_text().replace("\n", " "))          surname = raw.find("familyName") or None          if surname: -            surname = clean(surname.get_text().replace("\n", " ")) +            surname = clean_str(surname.get_text().replace("\n", " "))          given_name = raw.find("givenName") or None          if given_name: -            given_name = clean(given_name.get_text().replace("\n", " ")) +            given_name = clean_str(given_name.get_text().replace("\n", " "))          lang = "en"          if is_cjk(name):              lang = "ja" @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):          )          self.create_containers = kwargs.get("create_containers", True) -        extid_map_file = kwargs.get("extid_map_file") -        self.extid_map_db = None -        if extid_map_file: -            db_uri = "file:{}?mode=ro".format(extid_map_file) -            print("Using external ID map: {}".format(db_uri)) -            self.extid_map_db = sqlite3.connect(db_uri, uri=True) -        else: -            print("Not using external ID map") -          self.read_issn_map_file(issn_map_file) -    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: -        if self.extid_map_db is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = self.extid_map_db.execute( -            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] -        ).fetchone() -        if row is None: -            return dict( -                core_id=None, -                pmid=None, -                pmcid=None, -                wikidata_qid=None, -                arxiv_id=None, -                jstor_id=None, -            ) -        row = [str(cell or "") or None for cell in row] -        return dict( -            core_id=row[0], -            pmid=row[1], -            pmcid=row[2], -            wikidata_qid=row[3], -            # TODO: -            arxiv_id=None, -            jstor_id=None, -        ) -      def want(self, raw_record: Any) -> bool:          return True @@ -273,16 +230,16 @@ class JalcImporter(EntityImporter):                  for p in record.find_all("publicationName")                  if p.get_text()              ] -            pubs = [clean(p) for p in pubs if p] +            pubs = [clean_str(p) for p in pubs if p]              assert pubs              if len(pubs) > 1 and pubs[0] == pubs[1]:                  pubs = [pubs[0]]              if len(pubs) > 1 and is_cjk(pubs[0]):                  # eng/jpn ordering is not reliable                  pubs = [pubs[1], pubs[0]] -            container_name = clean(pubs[0]) +            container_name = clean_str(pubs[0])              if len(pubs) > 1: -                container_extra["original_name"] = clean(pubs[1]) +                container_extra["original_name"] = clean_str(pubs[1])          if record.publisher:              pubs = [ @@ -297,7 +254,7 @@ class JalcImporter(EntityImporter):                  # ordering is not reliable                  pubs = [pubs[1], pubs[0]]              if pubs: -                publisher = clean(pubs[0]) +                publisher = clean_str(pubs[0])                  if len(pubs) > 1:                      container_extra["publisher_aliases"] = pubs[1:] @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):         
 # reasonable default for this collection          release_type = "article-journal" -        # external identifiers -        extids = self.lookup_ext_ids(doi=doi) -          # extra:          #   translation_of          #   aliases @@ -342,26 +296,20 @@ class JalcImporter(EntityImporter):          # (informally)          extra["jalc"] = extra_jalc -        title = clean(title) +        title = clean_str(title)          if not title:              return None          re = ReleaseEntity(              work_id=None,              title=title, -            original_title=clean(original_title), +            original_title=clean_str(original_title),              release_type=release_type,              release_stage="published",              release_date=release_date,              release_year=release_year,              ext_ids=fatcat_openapi_client.ReleaseExtIds(                  doi=doi, -                pmid=extids["pmid"], -                pmcid=extids["pmcid"], -                wikidata_qid=extids["wikidata_qid"], -                core=extids["core_id"], -                arxiv=extids["arxiv_id"], -                jstor=extids["jstor_id"],              ),              volume=volume,              issue=issue, diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index a45e49f3..fc1dfcbd 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter  def or_none(s: Optional[str]) -> Optional[str]: @@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):          if extra_ia:              extra["ia"] = extra_ia -        name = clean(row.get("name")) +        name = clean_str(row.get("name"))          if not name:              return None @@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):              issnp=row.get("issnp"),              container_type=None,  # TODO              name=name, -            publisher=clean(row.get("publisher")), +            publisher=clean_str(row.get("publisher")),              wikidata_qid=None,  # TODO              extra=extra,          ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 2c8aa0a4..79691c9a 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,7 +8,10 @@ import fatcat_openapi_client  from bs4 import BeautifulSoup  from fatcat_openapi_client import ApiClient, ReleaseEntity -from .common import LANG_MAP_MARC, EntityImporter, clean +from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC +from fatcat_tools.normal import clean_doi, clean_str + +from .common import EntityImporter  from .crossref import CONTAINER_TYPE_MAP  # TODO: more entries? 
@@ -138,7 +141,7 @@ class JstorImporter(EntityImporter):                  issnl=issnl,                  publisher=publisher,                  container_type=self.map_container_type(release_type), -                name=clean(journal_title, force_xml=True), +                name=clean_str(journal_title, force_xml=True),              )              ce_edit = self.create_container(ce)              container_id = ce_edit.ident @@ -146,7 +149,9 @@ class JstorImporter(EntityImporter):          doi = article_meta.find("article-id", {"pub-id-type": "doi"})          if doi: -            doi = doi.string.lower().strip() +            doi = clean_doi(doi.string.lower()) +        else: +            doi = None          jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})          if jstor_id: @@ -162,13 +167,13 @@ class JstorImporter(EntityImporter):              for c in cgroup.find_all("contrib"):                  given = c.find("given-names")                  if given: -                    given = clean(given.get_text().replace("\n", " ")) +                    given = clean_str(given.get_text().replace("\n", " "))                  surname = c.find("surname")                  if surname: -                    surname = clean(surname.get_text().replace("\n", " ")) +                    surname = clean_str(surname.get_text().replace("\n", " "))                  raw_name = c.find("string-name")                  if raw_name: -                    raw_name = clean(raw_name.get_text().replace("\n", " ")) +                    raw_name = clean_str(raw_name.get_text().replace("\n", " "))                  if not raw_name:                      if given and surname: @@ -230,7 +235,7 @@ class JstorImporter(EntityImporter):          # JSTOR issue-id          if article_meta.find("issue-id"): -            issue_id = clean(article_meta.find("issue-id").string) +            issue_id = clean_str(article_meta.find("issue-id").string)              if issue_id:                  extra_jstor["issue_id"] = issue_id diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 430cdd0f..f3d82a86 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional  import fatcat_openapi_client  from fatcat_openapi_client import ApiClient, CreatorEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter  def value_or_none(e: Any) -> Any: @@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):          if not self.is_orcid(orcid):              sys.stderr.write("Bad ORCID: {}\n".format(orcid))              return None -        display = clean(display) +        display = clean_str(display)          if not display:              # must have *some* name              return None          ce = CreatorEntity(              orcid=orcid, -            given_name=clean(given), -            surname=clean(sur), +            given_name=clean_str(given), +            surname=clean_str(sur),              display_name=display,              extra=extra,          ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 1cdb450b..a6c7409d 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -8,317 +8,15 @@ import fatcat_openapi_client  from bs4 import BeautifulSoup  from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal 
import clean_doi, clean_issn, clean_pmcid, clean_pmid - -from .common import LANG_MAP_MARC, EntityImporter, clean - -# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly -PUBMED_RELEASE_TYPE_MAP = { -    # Adaptive Clinical Trial -    "Address": "speech", -    "Autobiography": "book", -    # Bibliography -    "Biography": "book", -    # Case Reports -    "Classical Article": "article-journal", -    # Clinical Conference -    # Clinical Study -    # Clinical Trial -    # Clinical Trial, Phase I -    # Clinical Trial, Phase II -    # Clinical Trial, Phase III -    # Clinical Trial, Phase IV -    # Clinical Trial Protocol -    # Clinical Trial, Veterinary -    # Collected Works -    # Comparative Study -    # Congress -    # Consensus Development Conference -    # Consensus Development Conference, NIH -    # Controlled Clinical Trial -    "Dataset": "dataset", -    # Dictionary -    # Directory -    # Duplicate Publication -    "Editorial": "editorial", -    # English Abstract   # doesn't indicate that this is abstract-only -    # Equivalence Trial -    # Evaluation Studies -    # Expression of Concern -    # Festschrift -    # Government Document -    # Guideline -    "Historical Article": "article-journal", -    # Interactive Tutorial -    "Interview": "interview", -    "Introductory Journal Article": "article-journal", -    "Journal Article": "article-journal", -    "Lecture": "speech", -    "Legal Case": "legal_case", -    "Legislation": "legislation", -    "Letter": "letter", -    # Meta-Analysis -    # Multicenter Study -    # News -    "Newspaper Article": "article-newspaper", -    # Observational Study -    # Observational Study, Veterinary -    # Overall -    # Patient Education Handout -    # Periodical Index -    # Personal Narrative -    # Portrait -    # Practice Guideline -    # Pragmatic Clinical Trial -    # Publication Components -    # Publication Formats -    # Publication Type Category -    # Randomized Controlled Trial -    # Research Support, American Recovery and Reinvestment Act -    # Research Support, N.I.H., Extramural -    # Research Support, N.I.H., Intramural -    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. -    # Research Support, U.S. Gov't, P.H.S. 
-    # Review     # in the "literature review" sense, not "product review" -    # Scientific Integrity Review -    # Study Characteristics -    # Support of Research -    # Systematic Review -    "Technical Report": "report", -    # Twin Study -    # Validation Studies -    # Video-Audio Media -    # Webcasts -} - -MONTH_ABBR_MAP = { -    "Jan": 1, -    "01": 1, -    "Feb": 2, -    "02": 2, -    "Mar": 3, -    "03": 3, -    "Apr": 4, -    "04": 4, -    "May": 5, -    "05": 5, -    "Jun": 6, -    "06": 6, -    "Jul": 7, -    "07": 7, -    "Aug": 8, -    "08": 8, -    "Sep": 9, -    "09": 9, -    "Oct": 10, -    "10": 10, -    "Nov": 11, -    "11": 11, -    "Dec": 12, -    "12": 12, -} - -# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ -COUNTRY_NAME_MAP = { -    "Afghanistan": "af", -    "Albania": "al", -    "Algeria": "dz", -    "Andorra": "ad", -    "Angola": "ao", -    "Antigua and Barbuda": "ag", -    "Argentina": "ar", -    "Armenia": "am", -    "Australia": "au", -    "Austria": "at", -    "Azerbaijan": "az", -    "Bahamas": "bs", -    "Bahrain": "bh", -    "Bangladesh": "bd", -    "Barbados": "bb", -    "Belarus": "by", -    "Belgium": "be", -    "Belize": "bz", -    "Benin": "bj", -    "Bhutan": "bt", -    "Bolivia": "bo", -    "Bosnia and Herzegowina": "ba", -    "Botswana": "bw", -    "Brazil": "br", -    "Brunei Darussalam": "bn", -    "Bulgaria": "bg", -    "Burkina Faso": "bf", -    "Burundi": "bi", -    "Cambodia": "kh", -    "Cameroon": "cm", -    "Canada": "ca", -    "Cape Verde": "cv", -    "Central African Republic": "cf", -    "Chad": "td", -    "Chile": "cl", -    "China": "cn", -    "Colombia": "co", -    "Comoros": "km", -    "Congo, Democratic Republic": "cd", -    "Congo, People’s Republic": "cg", -    "Costa Rica": "cr", -    "Cote d'Ivoire": "ci", -    "Croatia (Local Name: Hrvatska)": "hr", -    "Cuba": "cu", -    "Cyprus": "cy", -    "Czech Republic": "cz", -    "Denmark": "dk", -    "Djibouti": "dj", -    "Dominica": "dm", -    "Dominican Republic": "do", -    "East Timor": "tl", -    "Ecuador": "ec", -    "El Salvador": "sv", -    "Equatorial Guinea": "gq", -    "Eritrea": "er", -    "Estonia": "ee", -    "Ethiopia": "et", -    "Fiji": "fj", -    "Finland": "fi", -    "France": "fr", -    "Gabon": "ga", -    "Gambia": "gm", -    "Georgia": "ge", -    "Germany": "de", -    "Ghana": "gh", -    "Greece": "gr", -    "Greenland": "gl", -    "Grenada": "gd", -    "Guatemala": "gt", -    "Guinea": "gn", -    "Guinea-Bissau": "gw", -    "Guyana": "gy", -    "Haiti": "ht", -    "Honduras": "hn", -    "Hong Kong": "hk", -    "Hungary": "hu", -    "Iceland": "is", -    "India": "in", -    "Indonesia": "id", -    "Iran": "ir", -    "Iraq": "iq", -    "Ireland": "ie", -    "Israel": "il", -    "Italy": "it", -    "Jamaica": "jm", -    "Japan": "jp", -    "Jordan": "jo", -    "Kazakhstan": "kz", -    "Kenya": "ke", -    "Kiribati": "ki", -    "Korea, Democratic People's Republic": "kp", -    "Korea, Republic": "kr", -    "Kuwait": "kw", -    "Kyrgyzstan": "kg", -    "Laos": "la", -    "Latvia": "lv", -    "Lebanon": "lb", -    "Lesotho": "ls", -    "Liberia": "lr", -    "Libya": "ly", -    "Liechtenstein": "li", -    "Lithuania": "lt", -    "Luxembourg": "lu", -    "Macedonia": "mk", -    "Madagascar": "mg", -    "Malawi": "mw", -    "Malaysia": "my", -    "Maldives": "mv", -    "Mali": "ml", -    "Malta": "mt", -    "Marshall Islands": "mh", -    "Mauritania": "mr", -    "Mauritius": "mu", -    "Mexico": "mx", -    "Micronesia": "fm", -    "Moldova": "md", -    
"Monaco": "mc", -    "Mongolia": "mn", -    "Morocco": "ma", -    "Mozambique": "mz", -    "Myanmar": "mm", -    "Namibia": "na", -    "Nauru": "nr", -    "Nepal": "np", -    "Netherlands": "nl", -    "New Zealand": "nz", -    "Nicaragua": "ni", -    "Niger": "ne", -    "Nigeria": "ng", -    "Norway": "no", -    "Oman": "om", -    "Pakistan": "pk", -    "Palau": "pw", -    "Panama": "pa", -    "Papua New Guinea": "pg", -    "Paraguay": "py", -    "Peru": "pe", -    "Philippines": "ph", -    "Poland": "pl", -    "Portugal": "pt", -    "Puerto Rico": "pr", -    "Qatar": "qa", -    "Romania": "ro", -    "Russian Federation": "ru", -    "Rwanda": "rw", -    "Saint Kitts and Nevis": "kn", -    "Saint Lucia": "lc", -    "Saint Vincent and the Grenadines": "vc", -    "Samoa": "ws", -    "San Marino": "sm", -    "Sao Tome and Príncipe": "st", -    "Saudi Arabia": "sa", -    "Senegal": "sn", -    "Serbia and Montenegro": "cs", -    "Seychelles": "sc", -    "Sierra Leone": "sl", -    "Singapore": "sg", -    "Slovakia (Slovak Republic)": "sk", -    "Slovenia": "si", -    "Solomon Islands": "sb", -    "Somalia": "so", -    "South Africa": "za", -    "Spain": "es", -    "Sri Lanka": "lk", -    "Sudan": "sd", -    "Suriname": "sr", -    "Swaziland": "sz", -    "Sweden": "se", -    "Switzerland": "ch", -    "Syrian Arab Republic": "sy", -    "Taiwan": "tw", -    "Tajikistan": "tj", -    "Tanzania": "tz", -    "Tanzania": "tz", -    "Thailand": "th", -    "Togo": "tg", -    "Tonga": "to", -    "Trinidad and Tobago": "tt", -    "Tunisia": "tn", -    "Turkey": "tr", -    "Turkmenistan": "tm", -    "Tuvalu": "tv", -    "Uganda": "ug", -    "Ukraine": "ua", -    "United Arab Emirates": "ae", -    "United Kingdom": "gb", -    "United States": "us", -    "Uruguay": "uy", -    # Additions from running over large files -    "Bosnia and Herzegovina": "ba", -    # "International" -    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn -    "Russia (Federation)": "ru", -    "Scotland": "gb", -    "England": "gb", -    "Korea (South)": "kr", -    "Georgia (Republic)": "ge", -    "Egypt": "eg", -} +from fatcat_tools.biblio_lookup_tables import ( +    COUNTRY_NAME_MAP, +    LANG_MAP_MARC, +    MONTH_ABBR_MAP, +    PUBMED_RELEASE_TYPE_MAP, +) +from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str + +from .common import EntityImporter  class PubmedImporter(EntityImporter): @@ -704,14 +402,14 @@ class PubmedImporter(EntityImporter):          if extra_pubmed:              extra["pubmed"] = extra_pubmed -        title = clean(title) +        title = clean_str(title)          if not title:              return None          re = fatcat_openapi_client.ReleaseEntity(              work_id=None,              title=title, -            original_title=clean(original_title), +            original_title=clean_str(original_title),              release_type=release_type,              release_stage=release_stage,              release_date=release_date, diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py deleted file mode 100755 index 5caed2c7..00000000 --- a/python/fatcat_tools/importers/wayback_static.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 - -""" -Helpers to create Web Capture entities from extracted wayback content. - -Works as a stand-alone script (for debugging) or as library routines. 
-""" - -import argparse -import datetime -import hashlib -import json -import subprocess -import sys -from typing import Any, Dict, List, Optional, Tuple - -import requests -from bs4 import BeautifulSoup -from fatcat_openapi_client import ( -    ApiClient, -    Editgroup, -    EntityEdit, -    WebcaptureCdxLine, -    WebcaptureEntity, -    WebcaptureUrl, -) - -from .common import b32_hex - -CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" -GWB_URL_BASE = "https://web.archive.org/web" -REQ_SESSION = requests.Session() - - -def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: -    """Takes a wayback machine URL, and returns a tuple: - -    (timestamp, datetime, original_url) -    """ -    chunks = url.split("/") -    assert len(chunks) >= 6 -    assert chunks[2] == "web.archive.org" -    assert chunks[3] == "web" -    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) - - -def test_parse_wbm_url() -> None: -    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" -    assert parse_wbm_url(u) == ( -        "20010712114837", -        datetime.datetime(2001, 7, 12, 11, 48, 37), -        "http://www.dlib.org/dlib/june01/reich/06reich.html", -    ) - - -def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: -    """ -    Takes a complete WBM timestamp string (like "20020327115625") and returns a -    python datetime object (UTC) -    """ -    # strip any "im_" or "id_" suffix -    if timestamp.endswith("_"): -        timestamp = timestamp[:-3] -    # inflexible; require the full second-precision timestamp -    assert len(timestamp) == 14 -    return datetime.datetime( -        year=int(timestamp[0:4]), -        month=int(timestamp[4:6]), -        day=int(timestamp[6:8]), -        hour=int(timestamp[8:10]), -        minute=int(timestamp[10:12]), -        second=int(timestamp[12:14]), -    ) - - -def test_parse_wbm_timestamp() -> None: -    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) - - -def fetch_wbm(url: str) -> bytes: -    resp = REQ_SESSION.get(url) -    resp.raise_for_status() -    assert resp.content -    return resp.content - - -def lookup_cdx( -    embed_url: str, verify_hashes: bool = True, cdx_output: Any = None -) -> Optional[WebcaptureCdxLine]: -    sys.stderr.write(embed_url + "\n") -    assert embed_url.startswith("/web/") -    embed_url_segments = embed_url.split("/") -    timestamp = embed_url_segments[2] -    if timestamp.endswith("_"): -        timestamp = timestamp[:-3] -    url = "/".join(embed_url_segments[3:]) -    # print((timestamp, url)) -    params: Dict = dict( -        url=url, -        closest=timestamp, -        sort="closest", -        resolveRevisits="true", -        matchType="exact", -        limit=1, -    ) -    resp = REQ_SESSION.get( -        CDX_API_BASE, -        params=params, -    ) -    resp.raise_for_status() -    # print(resp.url) -    if resp.content: -        hit = resp.content.decode("utf-8").split("\n")[0] -        if cdx_output: -            cdx_output.write(hit + "\n") -        cdx_chunks = hit.split(" ") -        cdx = [x if (x and x != "-") else None for x in cdx_chunks] -        webcapture_cdx = WebcaptureCdxLine( -            surt=cdx[0], -            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z", -            url=cdx[2], -            mimetype=cdx[3], -            status_code=int(cdx[4] or ""), -            sha1=b32_hex(cdx[5] or ""), -            sha256=None, -        ) -        if 
verify_hashes: -            resp = REQ_SESSION.get( -                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp -            ) -            resp.raise_for_status() -            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() -            webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() -            webcapture_cdx.size = len(resp.content) -        return webcapture_cdx -    else: -        return None - - -def wayback_url_to_relative(url: str) -> Optional[str]: -    """ -    Wayback URLs can be relative or absolute in rewritten documents. This -    function converts any form of rewritten URL to a relative (to -    web.archive.org) one, or returns None if it isn't a rewritten URL at all. -    """ -    if url.startswith("https://web.archive.org/"): -        url = url[23:] -    elif url.startswith("http://web.archive.org/"): -        url = url[22:] - -    if url.startswith("/web/"): -        return url -    else: -        return None - - -def extract_embeds(soup: BeautifulSoup) -> List[str]: - -    embeds = set() - -    # <link href=""> -    for tag in soup.find_all("link", href=True): -        if tag["rel"] not in ("stylesheet",): -            continue -        url = wayback_url_to_relative(tag["href"]) -        if url: -            embeds.add(url) -    # <img src=""> -    for tag in soup.find_all("img", src=True): -        url = wayback_url_to_relative(tag["src"]) -        if url: -            embeds.add(url) - -    # <script src=""> -    for tag in soup.find_all("script", src=True): -        url = wayback_url_to_relative(tag["src"]) -        if url: -            embeds.add(url) - -    return list(embeds) - - -def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity: -    """ -    Given a complete wayback machine capture URL, like: - -        http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html - -    Will return a new ("bare") fatcat webcapture entity python object, with all -    the CDX entries filled in. -    """ - -    wbm_html = fetch_wbm(wayback_url) -    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) -    # with open(rewritten_path, 'r') as fp: -    #    soup = BeautifulSoup(fp, "lxml") -    soup = BeautifulSoup(wbm_html, "lxml") -    embeds = extract_embeds(soup) -    cdx_obj = lookup_cdx( -        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output -    ) -    cdx_list = [cdx_obj] -    for url in embeds: -        cdx_obj = lookup_cdx(url, cdx_output=cdx_output) -        cdx_list.append(cdx_obj) -    archive_urls = [ -        WebcaptureUrl( -            rel="wayback", -            url="https://web.archive.org/web/", -        ) -    ] -    wc = WebcaptureEntity( -        cdx=cdx_list, -        timestamp=timestamp.isoformat() + "Z", -        original_url=original_url, -        archive_urls=archive_urls, -        release_ids=None, -    ) -    return wc - - -def auto_wayback_static( -    api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None -) -> Tuple[Optional[str], Optional[EntityEdit]]: -    """ -    Returns a tuple: (editgroup_id, edit). 
If failed, both are None -    """ - -    raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) -    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - -    release = api.get_release(release_id, expand="webcaptures") - -    # check for existing webcapture with same parameters -    for wc in release.webcaptures: -        if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): -            # skipping: already existed -            print( -                "release {} already had webcapture {} {}".format( -                    release_id, raw_timestamp, original_url -                ) -            ) -            return (None, None) - -    wc = static_wayback_webcapture(wayback_url) -    assert len(wc.cdx) >= 1 -    wc.release_ids = [release_id] -    if not editgroup_id: -        eg = api.create_editgroup( -            Editgroup( -                description="One-off import of static web content from wayback machine", -                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), -            ) -        ) -        editgroup_id = eg.editgroup_id -    edit = api.create_webcapture(eg.editgroup_id, wc) -    return (editgroup_id, edit) - - -def main() -> None: -    parser = argparse.ArgumentParser() -    parser.add_argument("--verbose", action="store_true", help="verbose output") -    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") -    parser.add_argument( -        "--json-output", -        type=argparse.FileType("w"), -        default=sys.stdout, -        help="where to write out webcapture entity (as JSON)", -    ) -    parser.add_argument( -        "--cdx-output", -        type=argparse.FileType("w"), -        default=None, -        help="(optional) file to write out CDX stub", -    ) - -    args = parser.parse_args() - -    # entity-to-JSON code; duplicate of entity_to_dict() -    api_client = ApiClient() -    wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output) -    wc_dict = api_client.sanitize_for_serialization(wc) -    print(json.dumps(wc_dict)) - - -if __name__ == "__main__": -    main()
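For orientation, a minimal sketch of the import layout the hunks above converge on, assuming the `fatcat_tools.normal` and `fatcat_tools.biblio_lookup_tables` modules referenced throughout these diffs (`EntityImporter` itself stays in `.common`). The input strings are hypothetical examples, not values taken from any importer, and the None-return behavior of `clean_doi()` is an assumption inferred from how the importers guard its result:

    from fatcat_tools.biblio_lookup_tables import PUBMED_RELEASE_TYPE_MAP
    from fatcat_tools.normal import clean_doi, clean_str

    # String normalization: the old importers.common "clean" helper is now
    # imported as clean_str from fatcat_tools.normal.
    title = clean_str("An  Example\nTitle ")  # hypothetical input string

    # DOI handling: importers pass raw strings through clean_doi(); assumed to
    # return a normalized DOI string or None if the value does not parse.
    doi = clean_doi("10.123/example.doi".lower())  # hypothetical DOI string

    # Large static mappings now live in biblio_lookup_tables.py; this entry is
    # from the table moved out of pubmed.py in this diff.
    release_type = PUBMED_RELEASE_TYPE_MAP.get("Journal Article")  # "article-journal"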
