aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py8
-rw-r--r--python/fatcat_tools/importers/arabesque.py4
-rw-r--r--python/fatcat_tools/importers/arxiv.py6
-rwxr-xr-xpython/fatcat_tools/importers/cdl_dash_dat.py219
-rw-r--r--python/fatcat_tools/importers/chocula.py8
-rw-r--r--python/fatcat_tools/importers/common.py69
-rw-r--r--python/fatcat_tools/importers/crossref.py209
-rw-r--r--python/fatcat_tools/importers/datacite.py237
-rw-r--r--python/fatcat_tools/importers/doaj_article.py5
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py39
-rw-r--r--python/fatcat_tools/importers/jalc.py74
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py8
-rw-r--r--python/fatcat_tools/importers/jstor.py19
-rw-r--r--python/fatcat_tools/importers/orcid.py10
-rw-r--r--python/fatcat_tools/importers/pubmed.py324
-rwxr-xr-xpython/fatcat_tools/importers/wayback_static.py287
16 files changed, 146 insertions, 1380 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 06ecfd58..654be2e9 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -13,10 +13,8 @@ To run an import you combine two classes; one each of:
from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter
from .arxiv import ArxivRawImporter
-from .cdl_dash_dat import auto_cdl_dash_dat
from .chocula import ChoculaImporter
from .common import (
- LANG_MAP_MARC,
Bs4XmlFileListPusher,
Bs4XmlFilePusher,
Bs4XmlLargeFilePusher,
@@ -28,11 +26,8 @@ from .common import (
KafkaJsonPusher,
LinePusher,
SqlitePusher,
- clean,
- is_cjk,
- make_kafka_consumer,
)
-from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug
+from .crossref import CrossrefImporter
from .datacite import DataciteImporter
from .dblp_container import DblpContainerImporter
from .dblp_release import DblpReleaseImporter
@@ -55,4 +50,3 @@ from .matched import MatchedImporter
from .orcid import OrcidImporter
from .pubmed import PubmedImporter
from .shadow import ShadowLibraryImporter
-from .wayback_static import auto_wayback_static
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index b4a4d9ed..92289bb3 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity
-from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
+from fatcat_tools.normal import b32_hex
+
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url
ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1d50dd9a..dd2c2284 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -9,6 +9,8 @@ from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
from pylatexenc.latex2text import LatexNodes2Text
+from fatcat_tools.normal import clean_doi
+
from .common import EntityImporter
from .crossref import lookup_license_slug
@@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter):
base_id = metadata.id.string
doi = None
if metadata.doi and metadata.doi.string:
- doi = metadata.doi.string.lower().split()[0].strip()
- if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
+ doi = clean_doi(metadata.doi.string.lower().split()[0].strip())
+ if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
title = latex_to_text(metadata.title.get_text().replace("\n", " "))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
deleted file mode 100755
index 1a4114a0..00000000
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-
-import hashlib
-import json
-import mimetypes
-import os
-import subprocess
-import sys
-import urllib
-import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
-
-import fatcat_openapi_client
-import magic
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- FilesetEntity,
- FilesetFile,
- ReleaseAbstract,
- ReleaseContrib,
- ReleaseEntity,
- ReleaseExtIds,
-)
-
-from .common import clean
-from .crossref import lookup_license_slug
-
-
-def single_file(prefix: str, path: str) -> FilesetFile:
-
- full = prefix + path
- size_bytes = os.stat(full).st_size
-
- hashes = [
- hashlib.md5(),
- hashlib.sha1(),
- hashlib.sha256(),
- ]
- with open(full, "rb") as fp:
- while True:
- data = fp.read(2 ** 20)
- if not data:
- break
- for h in hashes:
- h.update(data)
- mime = magic.Magic(mime=True).from_file(full)
- if mime == "application/octet-stream":
- # magic apparently isn't that great; try using filename as well
- guess = mimetypes.guess_type(full)[0]
- if guess:
- mime = guess
-
- fsf = FilesetFile(
- path=path,
- size=size_bytes,
- md5=hashes[0].hexdigest(),
- sha1=hashes[1].hexdigest(),
- sha256=hashes[2].hexdigest(),
- extra=dict(mimetype=mime),
- )
- return fsf
-
-
-def make_manifest(base_dir: str) -> List[FilesetFile]:
- manifest = []
- for root, dirs, files in os.walk(base_dir):
- for f in files:
- manifest.append(single_file(root, f))
- return manifest
-
-
-def cdl_dash_release(
- meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None
-) -> ReleaseEntity:
-
- if not extra:
- extra = dict()
-
- assert meta["identifier"]["type"] == "DOI"
- doi = meta["identifier"]["value"].lower()
- assert doi.startswith("10.")
-
- ark_id = None
- for extid in meta.get("alternativeIdentifiers", []):
- if extid["value"].startswith("ark:"):
- ark_id = extid["value"]
- assert ark_id
-
- license_slug = lookup_license_slug(meta["rights"]["uri"])
-
- abstracts = []
- for desc in meta["descriptions"]:
- if desc["type"] == "abstract":
- abstracts.append(
- ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
- )
- # print(abstracts)
-
- contribs = []
- for creator in meta["creator"]:
- contribs.append(
- ReleaseContrib(
- given_name=creator["given"],
- surname=creator["family"],
- # sorry everybody
- raw_name="{} {}".format(creator["given"], creator["family"]),
- raw_affiliation=creator.get("affiliation"),
- role="author", # presumably, for these datasets?
- )
- )
-
- r = ReleaseEntity(
- ext_ids=ReleaseExtIds(
- doi=doi,
- ark=ark_id,
- ),
- title=clean(meta["title"], force_xml=True),
- publisher=clean(meta["publisher"]),
- release_year=int(meta["publicationYear"]),
- release_type="dataset",
- license_slug=license_slug,
- contribs=contribs,
- abstracts=abstracts or None,
- extra=extra,
- )
- return r
-
-
-def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]:
-
- if dat_path.endswith("/"):
- dat_path = dat_path[:-1]
- dat_discovery = dat_path
- extra = dict()
- assert len(dat_discovery) == 64
-
- with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
- meta_dict = json.loads(fp.read())
-
- release = cdl_dash_release(meta_dict)
- ark_id = release.extra["ark_id"]
-
- dash_version = None
- # really crude XML parse-out
- with open(dat_path + "/stash-wrapper.xml", "r") as fp:
- for line in fp:
- line = line.strip()
- if line.startswith("<st:version_number>"):
- dash_version = int(line[19:].split("<")[0])
- assert dash_version is not None
- extra["cdl_dash"] = dict(version=dash_version)
- release.extra["cdl_dash"] = dict(version=dash_version)
-
- manifest = make_manifest(dat_path + "/files/")
-
- bundle_url = dict(
- url="https://merritt.cdlib.org/u/{}/{}".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo-bundle",
- )
- repo_url = dict(
- url="https://merritt.cdlib.org/d/{}/{}/".format(
- urllib.parse.quote(ark_id, safe=""), dash_version
- ),
- rel="repo",
- )
- dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
- fs = FilesetEntity(
- urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
- )
- return (release, fs)
-
-
-def auto_cdl_dash_dat(
- api: ApiClient,
- dat_path: str,
- release_id: Optional[str] = None,
- editgroup_id: Optional[str] = None,
-) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]:
-
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- (release, fileset) = make_release_fileset(dat_path)
-
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
- )
- )
- editgroup_id = eg.editgroup_id
-
- if not release_id and release.ext_ids.doi:
- try:
- r = api.lookup_release(doi=release.ext_ids.doi)
- release_id = r.ident
- except fatcat_openapi_client.rest.ApiException:
- pass
- if not release_id:
- edit = api.create_release(eg.editgroup_id, release)
- release_id = edit.ident
-
- release = api.get_release(release_id, expand="filesets")
- if len(release.filesets):
- print("A fileset already exists for release {}".format(release.ident))
- return (None, None, None)
-
- fileset.release_ids = [release.ident]
- edit = api.create_fileset(eg.editgroup_id, fileset)
- fileset = api.get_fileset(edit.ident)
- return (editgroup_id, release, fileset)
-
-
-if __name__ == "__main__":
- # pass this a discovery key that has been cloned to the local directory
- print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 842c7853..c44fec3b 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ContainerEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
class ChoculaImporter(EntityImporter):
@@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- name = clean(row.get("name"))
+ name = clean_str(row.get("name"))
if not name:
# Name is required (by schema)
return None
@@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter):
ident=row["ident"],
name=name,
container_type=container_type,
- publisher=clean(row.get("publisher")),
+ publisher=clean_str(row.get("publisher")),
wikidata_qid=row.get("wikidata_qid"),
extra=extra,
)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2ec6efda..e2157ee5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -27,74 +27,14 @@ from fatcat_openapi_client import (
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
-# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import is_cjk # noqa: F401
-from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401
-from fatcat_tools.normal import clean_str as clean # noqa: F401
+from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP
+from fatcat_tools.normal import clean_doi
from fatcat_tools.transforms import entity_to_dict
DATE_FMT: str = "%Y-%m-%d"
SANE_MAX_RELEASES: int = 200
SANE_MAX_URLS: int = 100
-
-DOMAIN_REL_MAP: Dict[str, str] = {
- "archive.org": "archive",
- # LOCKSS, Portico, DuraSpace, etc would also be "archive"
- "arxiv.org": "repository",
- "babel.hathitrust.org": "repository",
- "cds.cern.ch": "repository",
- "deepblue.lib.umich.edu": "repository",
- "europepmc.org": "repository",
- "hal.inria.fr": "repository",
- "scielo.isciii.es": "repository",
- "www.dtic.mil": "repository",
- "www.jstage.jst.go.jp": "repository",
- "www.jstor.org": "repository",
- "www.ncbi.nlm.nih.gov": "repository",
- "ftp.ncbi.nlm.nih.gov": "repository",
- "www.scielo.br": "repository",
- "www.scielo.cl": "repository",
- "www.scielo.org.mx": "repository",
- "zenodo.org": "repository",
- "www.biorxiv.org": "repository",
- "www.medrxiv.org": "repository",
- "citeseerx.ist.psu.edu": "aggregator",
- "publisher-connector.core.ac.uk": "aggregator",
- "core.ac.uk": "aggregator",
- "static.aminer.org": "aggregator",
- "aminer.org": "aggregator",
- "pdfs.semanticscholar.org": "aggregator",
- "semanticscholar.org": "aggregator",
- "www.semanticscholar.org": "aggregator",
- "academic.oup.com": "publisher",
- "cdn.elifesciences.org": "publisher",
- "cell.com": "publisher",
- "dl.acm.org": "publisher",
- "downloads.hindawi.com": "publisher",
- "elifesciences.org": "publisher",
- "iopscience.iop.org": "publisher",
- "journals.plos.org": "publisher",
- "link.springer.com": "publisher",
- "onlinelibrary.wiley.com": "publisher",
- "works.bepress.com": "publisher",
- "www.biomedcentral.com": "publisher",
- "www.cell.com": "publisher",
- "www.nature.com": "publisher",
- "www.pnas.org": "publisher",
- "www.tandfonline.com": "publisher",
- "www.frontiersin.org": "publisher",
- "www.degruyter.com": "publisher",
- "www.mdpi.com": "publisher",
- "www.ahajournals.org": "publisher",
- "ehp.niehs.nih.gov": "publisher",
- "journals.tsu.ru": "publisher",
- "www.cogentoa.com": "publisher",
- "www.researchgate.net": "academicsocial",
- "academia.edu": "academicsocial",
- "wayback.archive-it.org": "webarchive",
- "web.archive.org": "webarchive",
- "archive.is": "webarchive",
-}
+MAX_ABSTRACT_LENGTH: int = 2048
def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]:
@@ -342,8 +282,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi: str) -> bool:
- # TODO: replace with clean_doi() from fatcat_tools.normal
- return doi.startswith("10.") and doi.count("/") >= 1
+ return clean_doi(doi) is not None
def lookup_doi(self, doi: str) -> Optional[str]:
"""Caches calls to the doi lookup API endpoint in a local dict
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..52bd7465 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,11 +1,13 @@
import datetime
-import sqlite3
from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from .common import EntityImporter, clean
+from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
+
+from .common import EntityImporter
# The docs/guide should be the canonical home for these mappings; update there
# first
@@ -32,104 +34,11 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
"standard": "standard",
}
-CONTAINER_TYPE_MAP: Dict[str, str] = {
- "article-journal": "journal",
- "paper-conference": "conference",
- "book": "book-series",
-}
-
-# These are based, informally, on sorting the most popular licenses found in
-# Crossref metadata. There were over 500 unique strings and only a few most
-# popular are here; many were variants of the CC URLs. Would be useful to
-# normalize CC licenses better.
-# The current norm is to only add license slugs that are at least partially OA.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//creativecommons.org/licenses/by/2.0/": "CC-BY",
- "//creativecommons.org/licenses/by/3.0/": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/": "CC-BY",
- "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.karger.com/Services/SiteLicenses": "KARGER",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER",
- "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA",
- "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC",
- # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
- # //www.springer.com/tdm doesn't seem like a license
- # //iopscience.iop.org/page/copyright is closed
- # //www.acm.org/publications/policies/copyright_policy#Background is closed
- # //rsc.li/journals-terms-of-use is closed for vor (am open)
- # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!)
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
-}
-
-
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
- if not raw:
- return None
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if "creativecommons.org" in raw.lower():
- raw = raw.lower()
- raw = raw.replace("/legalcode", "/").replace("/uk", "")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
-
-
-def test_lookup_license_slug() -> None:
-
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert (
- lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
- == "CC-BY"
- )
- assert (
- lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
- == "CC-0"
- )
- assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert (
- lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
- == "CC-BY-NC-SA"
- )
- assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
- assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
- assert lookup_license_slug("") is None
- assert lookup_license_slug(None) is None
-
class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
- Can use a local sqlite3 file for faster "external identifier" lookups
-
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
@@ -150,50 +59,8 @@ class CrossrefImporter(EntityImporter):
)
self.create_containers: bool = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db: Optional[Any] = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Optional[Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def map_release_type(self, crossref_type: str) -> Optional[str]:
return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -275,21 +142,21 @@ class CrossrefImporter(EntityImporter):
if len(affiliation_list) > 1:
# note: affiliation => more_affiliations
extra["more_affiliations"] = [
- clean(a["name"]) for a in affiliation_list[1:]
+ clean_str(a["name"]) for a in affiliation_list[1:]
]
if am.get("sequence") and am.get("sequence") != "additional":
- extra["seq"] = clean(am.get("sequence"))
+ extra["seq"] = clean_str(am.get("sequence"))
assert ctype in ("author", "editor", "translator")
- raw_name = clean(raw_name)
+ raw_name = clean_str(raw_name)
# TODO: what if 'raw_name' is None?
contribs.append(
ReleaseContrib(
creator_id=creator_id,
index=index,
raw_name=raw_name,
- given_name=clean(am.get("given")),
- surname=clean(am.get("family")),
- raw_affiliation=clean(raw_affiliation),
+ given_name=clean_str(am.get("given")),
+ surname=clean_str(am.get("family")),
+ raw_affiliation=clean_str(raw_affiliation),
role=ctype,
extra=extra or None,
)
@@ -306,11 +173,11 @@ class CrossrefImporter(EntityImporter):
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = clean(obj.get("publisher"))
+ publisher = clean_str(obj.get("publisher"))
container_name = obj.get("container-title")
if container_name:
- container_name = clean(container_name[0], force_xml=True)
+ container_name = clean_str(container_name[0], force_xml=True)
if not container_name:
container_name = None
if (
@@ -366,7 +233,7 @@ class CrossrefImporter(EntityImporter):
ref_extra["journal-title"] = rm["journal-title"]
if rm.get("DOI"):
ref_extra["doi"] = rm.get("DOI").lower()
- author = clean(rm.get("author"))
+ author = clean_str(rm.get("author"))
if author:
ref_extra["authors"] = [author]
for k in (
@@ -390,8 +257,8 @@ class CrossrefImporter(EntityImporter):
"series-title",
"volume-title",
):
- if clean(rm.get(k)):
- ref_extra[k] = clean(rm[k])
+ if clean_str(rm.get(k)):
+ ref_extra[k] = clean_str(rm[k])
refs.append(
fatcat_openapi_client.ReleaseRef(
index=i,
@@ -399,9 +266,9 @@ class CrossrefImporter(EntityImporter):
target_release_id=None,
key=key,
year=year,
- container_name=clean(ref_container_name),
- title=clean(rm.get("article-title")),
- locator=clean(rm.get("first-page")),
+ container_name=clean_str(ref_container_name),
+ title=clean_str(rm.get("article-title")),
+ locator=clean_str(rm.get("first-page")),
# TODO: just dump JSON somewhere here?
extra=ref_extra or None,
)
@@ -409,7 +276,7 @@ class CrossrefImporter(EntityImporter):
# abstracts
abstracts = []
- abstract = clean(obj.get("abstract"))
+ abstract = clean_str(obj.get("abstract"))
if abstract and len(abstract) > 10:
abstracts.append(
fatcat_openapi_client.ReleaseAbstract(
@@ -430,9 +297,9 @@ class CrossrefImporter(EntityImporter):
if type(val) == list:
val = val[0]
if type(val) == str:
- val = clean(val)
+ val = clean_str(val)
if val:
- extra[key] = clean(val)
+ extra[key] = clean_str(val)
else:
extra[key] = val
# crossref-nested extra keys
@@ -440,14 +307,14 @@ class CrossrefImporter(EntityImporter):
val = obj.get(key)
if val:
if type(val) == str:
- extra_crossref[key] = clean(val)
+ extra_crossref[key] = clean_str(val)
else:
extra_crossref[key] = val
if license_extra:
extra_crossref["license"] = license_extra
if len(obj["title"]) > 1:
- aliases = [clean(t) for t in obj["title"][1:]]
+ aliases = [clean_str(t) for t in obj["title"][1:]]
aliases = [t for t in aliases if t]
if aliases:
extra["aliases"] = aliases
@@ -473,9 +340,6 @@ class CrossrefImporter(EntityImporter):
# unknown
release_stage = None
- # external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
# filter out unreasonably huge releases
if len(abstracts) > 100:
self.counts["skip-huge-abstracts"] += 1
@@ -505,19 +369,24 @@ class CrossrefImporter(EntityImporter):
if obj.get("original-title"):
ot = obj.get("original-title")
if ot is not None:
- original_title = clean(ot[0], force_xml=True)
+ original_title = clean_str(ot[0], force_xml=True)
title: Optional[str] = None
if obj.get("title"):
- title = clean(obj["title"][0], force_xml=True)
+ title = clean_str(obj["title"][0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
self.counts["skip-blank-title"] += 1
return None
+ doi = clean_doi(obj["DOI"].lower())
+ if not doi:
+ self.counts["skip-bad-doi"] += 1
+ return None
+
subtitle = None
if obj.get("subtitle"):
- subtitle = clean(obj["subtitle"][0], force_xml=True)
+ subtitle = clean_str(obj["subtitle"][0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
@@ -537,19 +406,13 @@ class CrossrefImporter(EntityImporter):
release_year=release_year,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=obj["DOI"].lower(),
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
+ doi=doi,
isbn13=isbn13,
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
- volume=clean(obj.get("volume")),
- issue=clean(obj.get("issue")),
- pages=clean(obj.get("page")),
- language=clean(obj.get("language")),
+ volume=clean_str(obj.get("volume")),
+ issue=clean_str(obj.get("issue")),
+ pages=clean_str(obj.get("page")),
+ language=clean_str(obj.get("language")),
license_slug=license_slug,
extra=extra or None,
abstracts=abstracts or None,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..b310f8bc 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@ import collections
import datetime
import json
import re
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
@@ -22,113 +21,19 @@ import langdetect
import pycountry
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP
+from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug
from fatcat_tools.transforms import entity_to_dict
-from .common import EntityImporter, clean
-
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
-CONTAINER_TYPE_MAP: Dict[str, str] = {
+DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = {
"Journal": "journal",
"Series": "journal",
"Book Series": "book-series",
}
-# The docs/guide should be the canonical home for these mappings; update there
-# first. Map various datacite type types to CSL-ish types. None means TODO or
-# remove.
-DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = {
- "ris": {
- "THES": "thesis",
- "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
- "CHAP": "chapter",
- "FIGURE": "figure",
- "RPRT": "report",
- "JOUR": "article-journal",
- "MPCT": "motion_picture",
- "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- "BOOK": "book",
- "DATA": "dataset",
- "COMP": "software",
- },
- "schemaOrg": {
- "Dataset": "dataset",
- "Book": "book",
- "ScholarlyArticle": "article-journal",
- "ImageObject": "graphic",
- "Collection": None,
- "MediaObject": None,
- "Event": None,
- "SoftwareSourceCode": "software",
- "Chapter": "chapter",
- "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- "PublicationIssue": "article",
- "AudioObject": None,
- "Thesis": "thesis",
- },
- "citeproc": {
- "article": "article",
- "article-journal": "article-journal",
- "article-magazine": "article-magazine",
- "article-newspaper": "article-newspaper",
- "bill": "bill",
- "book": "book",
- "broadcast": "broadcast",
- "chapter": "chapter",
- "dataset": "dataset",
- "entry-dictionary": "entry-dictionary",
- "entry-encyclopedia": "entry-encyclopedia",
- "entry": "entry",
- "figure": "figure",
- "graphic": "graphic",
- "interview": "interview",
- "legal_case": "legal_case",
- "legislation": "legislation",
- "manuscript": "manuscript",
- "map": "map",
- "motion_picture": "motion_picture",
- "musical_score": "musical_score",
- "pamphlet": "pamphlet",
- "paper-conference": "paper-conference",
- "patent": "patent",
- "personal_communication": "personal_communication",
- "post": "post",
- "post-weblog": "post-weblog",
- "report": "report",
- "review-book": "review-book",
- "review": "review",
- "song": "song",
- "speech": "speech",
- "thesis": "thesis",
- "treaty": "treaty",
- "webpage": "webpage",
- }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- "bibtex": {
- "phdthesis": "thesis",
- "inbook": "chapter",
- "misc": None,
- "article": "article-journal",
- "book": "book",
- },
- "resourceTypeGeneral": {
- "Image": "graphic",
- "Dataset": "dataset",
- "PhysicalObject": None,
- "Collection": None,
- "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
- "Sound": None,
- "InteractiveResource": None,
- "Event": None,
- "Software": "software",
- "Other": None,
- "Workflow": None,
- "Audiovisual": None,
- }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
-}
-
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS: List[str] = [
"(:unac)", # temporarily inaccessible
@@ -181,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [
}
]
-# TODO(martin): merge this with other maps and lookup functions, eventually.
-LICENSE_SLUG_MAP: Dict[str, str] = {
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
- "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK",
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
- "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1",
- "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET",
- "//onlinelibrary.wiley.com/termsandconditions/": "WILEY",
- "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN",
- "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY",
- "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE",
- "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC",
- "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA",
- "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER",
- "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0",
- "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3",
- "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2",
- "//www.karger.com/Services/SiteLicenses/": "KARGER",
- "//www.springer.com/tdm/": "SPRINGER-TDM",
- "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/": "CC-0",
- "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0",
- "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0",
- "//spdx.org/licenses/CC0-1.0.json": "CC-0",
- "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY",
- "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC",
- "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA",
- "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA",
- "//spdx.org/licenses/MIT.json": "MIT",
- "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada",
-}
-
class DataciteImporter(EntityImporter):
"""
@@ -248,15 +116,6 @@ class DataciteImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri), file=sys.stderr)
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map", file=sys.stderr)
-
self.read_issn_map_file(issn_map_file)
self.debug = debug
self.insert_log_file = insert_log_file
@@ -264,42 +123,6 @@ class DataciteImporter(EntityImporter):
print("datacite with debug={}".format(self.debug), file=sys.stderr)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- """
- Return dictionary of identifiers referring to the same things as the given DOI.
- """
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
Mapping datacite JSON to ReleaseEntity.
@@ -368,7 +191,7 @@ class DataciteImporter(EntityImporter):
print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
- title = clean(title)
+ title = clean_str(title)
if not title:
print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
@@ -387,7 +210,7 @@ class DataciteImporter(EntityImporter):
if not subtitle:
subtitle = None
else:
- subtitle = clean(subtitle)
+ subtitle = clean_str(subtitle)
# Dates. A few internal dates (registered, created, updated) and
# published (0..2554). We try to work with typed date list, in
@@ -445,15 +268,15 @@ class DataciteImporter(EntityImporter):
publisher = None
if publisher:
- publisher = clean(publisher)
+ publisher = clean_str(publisher)
# Container. For the moment, only ISSN as container.
container_id = None
container_name = None
container = attributes.get("container", {}) or {}
- if container.get("type") in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys():
+ container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"])
if container.get("identifier") and container.get("identifierType") == "ISSN":
issn = container.get("identifier")
if issn and len(issn) == 8:
@@ -506,10 +329,10 @@ class DataciteImporter(EntityImporter):
issue = container.get("issue")
if volume:
- volume = clean(volume)
+ volume = clean_str(volume)
if issue:
- issue = clean(issue)
+ issue = clean_str(issue)
# Pages.
pages = None
@@ -534,7 +357,7 @@ class DataciteImporter(EntityImporter):
license_extra = []
for lic in attributes.get("rightsList", []):
- slug = lookup_license_slug(lic.get("rightsUri"))
+ slug = datacite_lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -594,7 +417,7 @@ class DataciteImporter(EntityImporter):
"[{}] language detection failed with {} on {}".format(doi, err, text),
file=sys.stderr,
)
- abstract_text = clean(text)
+ abstract_text = clean_str(text)
if not abstract_text:
continue
abstracts.append(
@@ -643,7 +466,13 @@ class DataciteImporter(EntityImporter):
if license_extra:
extra_datacite["license"] = license_extra
if attributes.get("subjects"):
- extra_datacite["subjects"] = attributes["subjects"]
+ # these subjects with schemeUri are too much metadata, which
+ # doesn't compress. filter them out.
+ extra_subjects = [
+ subj for subj in attributes["subjects"] if not subj.get("schemeUri")
+ ]
+ if extra_subjects:
+ extra_datacite["subjects"] = extra_subjects
# Include version information.
metadata_version = attributes.get("metadataVersion") or ""
@@ -706,8 +535,6 @@ class DataciteImporter(EntityImporter):
if release_month:
extra["release_month"] = release_month
- extids = self.lookup_ext_ids(doi=doi)
-
# Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -722,12 +549,6 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
@@ -922,14 +743,14 @@ class DataciteImporter(EntityImporter):
if len(affiliations) == 0:
raw_affiliation = None
else:
- raw_affiliation = clean(affiliations[0])
+ raw_affiliation = clean_str(affiliations[0])
name = c.get("name")
given_name = c.get("givenName")
surname = c.get("familyName")
if name:
- name = clean(name)
+ name = clean_str(name)
if not any((name, given_name, surname)):
continue
if not name:
@@ -943,8 +764,8 @@ class DataciteImporter(EntityImporter):
name = index_form_to_display_name(name)
if given_name:
- given_name = clean(given_name)
- surname = clean(surname)
+ given_name = clean_str(given_name)
+ surname = clean_str(surname)
# Perform a final assertion that name does not reduce to zero
# (e.g. whitespace only name).
@@ -1016,7 +837,7 @@ def contributor_list_contains_contributor(
return False
-def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
+def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]:
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
CC-BY-ND, CC-0, MIT and so on.
@@ -1111,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]:
return None
return "RS-{}".format(name.upper())
- # Fallback to mapped values.
- raw = raw.lower()
- raw = raw.strip().replace("http://", "//").replace("https://", "//")
- if not raw.endswith("/"):
- raw = raw + "/"
- return LICENSE_SLUG_MAP.get(raw)
+ # Fallback to generic license lookup
+ return lookup_license_slug(raw)
def find_original_language_title(
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index f5c886a2..92dbe574 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter
from fatcat_tools.normal import (
clean_doi,
clean_orcid,
@@ -24,9 +24,6 @@ from fatcat_tools.normal import (
parse_month,
)
-# Cutoff length for abstracts.
-MAX_ABSTRACT_LENGTH = 2048
-
class DoajArticleImporter(EntityImporter):
def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None:
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index e36e1b48..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
-from .common import EntityImporter, clean, make_rel_url
+from fatcat_tools.normal import clean_doi, clean_str
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
class GrobidMetadataImporter(EntityImporter):
@@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter):
extra_grobid: Dict[str, Any] = dict()
abstract = obj.get("abstract")
- if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+ if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain", content=clean(obj.get("abstract"))
+ mimetype="text/plain", content=clean_str(obj.get("abstract"))
)
abstracts = [abobj]
else:
@@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter):
contribs.append(
fatcat_openapi_client.ReleaseContrib(
index=i,
- raw_name=clean(a["name"]),
- given_name=clean(a.get("given_name")),
- surname=clean(a.get("surname")),
+ raw_name=clean_str(a["name"]),
+ given_name=clean_str(a.get("given_name")),
+ surname=clean_str(a.get("surname")),
role="author",
extra=None,
)
@@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
- cite_extra[key] = clean(raw[key])
+ cite_extra[key] = clean_str(raw[key])
if raw.get("authors"):
- cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+ cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
refs.append(
fatcat_openapi_client.ReleaseRef(
- key=clean(raw.get("id")),
+ key=clean_str(raw.get("id")),
year=year,
- title=clean(raw["title"]),
+ title=clean_str(raw["title"]),
extra=cite_extra or None,
)
)
@@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter):
# only returns year, ever?
release_year = int(obj["date"][:4])
- extra = dict()
- if obj.get("doi"):
- extra["doi"] = obj["doi"]
+ extra: Dict[str, Any] = dict()
+ doi = clean_doi(obj.get("doi"))
+ if doi:
+ extra["doi"] = doi
if obj["journal"] and obj["journal"].get("name"):
- extra["container_name"] = clean(obj["journal"]["name"])
+ extra["container_name"] = clean_str(obj["journal"]["name"])
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter):
if self.longtail_oa:
extra["longtail_oa"] = True
- clean_title = clean(obj["title"], force_xml=True)
+ clean_title = clean_str(obj["title"], force_xml=True)
if not clean_title or len(clean_title) < 2:
return None
title = clean_title
@@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter):
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=clean(obj["journal"].get("publisher")),
- volume=clean(obj["journal"].get("volume")),
- issue=clean(obj["journal"].get("issue")),
+ publisher=clean_str(obj["journal"].get("publisher")),
+ volume=clean_str(obj["journal"].get("volume")),
+ issue=clean_str(obj["journal"].get("issue")),
abstracts=abstracts or None,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
extra=extra or None,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence
@@ -7,9 +6,9 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
# TODO: should be List[Tag] not List[Any] for full type annotations
@@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
for raw in raw_persons:
name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace("\n", " "))
+ name = clean_str(name.get_text().replace("\n", " "))
surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace("\n", " "))
+ given_name = clean_str(given_name.get_text().replace("\n", " "))
lang = "en"
if is_cjk(name):
lang = "ja"
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, raw_record: Any) -> bool:
return True
@@ -273,16 +230,16 @@ class JalcImporter(EntityImporter):
for p in record.find_all("publicationName")
if p.get_text()
]
- pubs = [clean(p) for p in pubs if p]
+ pubs = [clean_str(p) for p in pubs if p]
assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
# eng/jpn ordering is not reliable
pubs = [pubs[1], pubs[0]]
- container_name = clean(pubs[0])
+ container_name = clean_str(pubs[0])
if len(pubs) > 1:
- container_extra["original_name"] = clean(pubs[1])
+ container_extra["original_name"] = clean_str(pubs[1])
if record.publisher:
pubs = [
@@ -297,7 +254,7 @@ class JalcImporter(EntityImporter):
# ordering is not reliable
pubs = [pubs[1], pubs[0]]
if pubs:
- publisher = clean(pubs[0])
+ publisher = clean_str(pubs[0])
if len(pubs) > 1:
container_extra["publisher_aliases"] = pubs[1:]
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
# reasonable default for this collection
release_type = "article-journal"
- # external identifiers
- extids = self.lookup_ext_ids(doi=doi)
-
# extra:
# translation_of
# aliases
@@ -342,26 +296,20 @@ class JalcImporter(EntityImporter):
# (informally)
extra["jalc"] = extra_jalc
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage="published",
release_date=release_date,
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index a45e49f3..fc1dfcbd 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, ContainerEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
def or_none(s: Optional[str]) -> Optional[str]:
@@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter):
if extra_ia:
extra["ia"] = extra_ia
- name = clean(row.get("name"))
+ name = clean_str(row.get("name"))
if not name:
return None
@@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter):
issnp=row.get("issnp"),
container_type=None, # TODO
name=name,
- publisher=clean(row.get("publisher")),
+ publisher=clean_str(row.get("publisher")),
wikidata_qid=None, # TODO
extra=extra,
)
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 2c8aa0a4..79691c9a 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -8,7 +8,10 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from .common import LANG_MAP_MARC, EntityImporter, clean
+from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC
+from fatcat_tools.normal import clean_doi, clean_str
+
+from .common import EntityImporter
from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
@@ -138,7 +141,7 @@ class JstorImporter(EntityImporter):
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=clean(journal_title, force_xml=True),
+ name=clean_str(journal_title, force_xml=True),
)
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
@@ -146,7 +149,9 @@ class JstorImporter(EntityImporter):
doi = article_meta.find("article-id", {"pub-id-type": "doi"})
if doi:
- doi = doi.string.lower().strip()
+ doi = clean_doi(doi.string.lower())
+ else:
+ doi = None
jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"})
if jstor_id:
@@ -162,13 +167,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.get_text().replace("\n", " "))
+ given = clean_str(given.get_text().replace("\n", " "))
surname = c.find("surname")
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.get_text().replace("\n", " "))
+ raw_name = clean_str(raw_name.get_text().replace("\n", " "))
if not raw_name:
if given and surname:
@@ -230,7 +235,7 @@ class JstorImporter(EntityImporter):
# JSTOR issue-id
if article_meta.find("issue-id"):
- issue_id = clean(article_meta.find("issue-id").string)
+ issue_id = clean_str(article_meta.find("issue-id").string)
if issue_id:
extra_jstor["issue_id"] = issue_id
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 430cdd0f..f3d82a86 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, CreatorEntity
-from .common import EntityImporter, clean
+from fatcat_tools.normal import clean_str
+
+from .common import EntityImporter
def value_or_none(e: Any) -> Any:
@@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter):
if not self.is_orcid(orcid):
sys.stderr.write("Bad ORCID: {}\n".format(orcid))
return None
- display = clean(display)
+ display = clean_str(display)
if not display:
# must have *some* name
return None
ce = CreatorEntity(
orcid=orcid,
- given_name=clean(given),
- surname=clean(sur),
+ given_name=clean_str(given),
+ surname=clean_str(sur),
display_name=display,
extra=extra,
)
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 1cdb450b..a6c7409d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,317 +8,15 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity
-from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid
-
-from .common import LANG_MAP_MARC, EntityImporter, clean
-
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
- # Adaptive Clinical Trial
- "Address": "speech",
- "Autobiography": "book",
- # Bibliography
- "Biography": "book",
- # Case Reports
- "Classical Article": "article-journal",
- # Clinical Conference
- # Clinical Study
- # Clinical Trial
- # Clinical Trial, Phase I
- # Clinical Trial, Phase II
- # Clinical Trial, Phase III
- # Clinical Trial, Phase IV
- # Clinical Trial Protocol
- # Clinical Trial, Veterinary
- # Collected Works
- # Comparative Study
- # Congress
- # Consensus Development Conference
- # Consensus Development Conference, NIH
- # Controlled Clinical Trial
- "Dataset": "dataset",
- # Dictionary
- # Directory
- # Duplicate Publication
- "Editorial": "editorial",
- # English Abstract # doesn't indicate that this is abstract-only
- # Equivalence Trial
- # Evaluation Studies
- # Expression of Concern
- # Festschrift
- # Government Document
- # Guideline
- "Historical Article": "article-journal",
- # Interactive Tutorial
- "Interview": "interview",
- "Introductory Journal Article": "article-journal",
- "Journal Article": "article-journal",
- "Lecture": "speech",
- "Legal Case": "legal_case",
- "Legislation": "legislation",
- "Letter": "letter",
- # Meta-Analysis
- # Multicenter Study
- # News
- "Newspaper Article": "article-newspaper",
- # Observational Study
- # Observational Study, Veterinary
- # Overall
- # Patient Education Handout
- # Periodical Index
- # Personal Narrative
- # Portrait
- # Practice Guideline
- # Pragmatic Clinical Trial
- # Publication Components
- # Publication Formats
- # Publication Type Category
- # Randomized Controlled Trial
- # Research Support, American Recovery and Reinvestment Act
- # Research Support, N.I.H., Extramural
- # Research Support, N.I.H., Intramural
- # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- # Research Support, U.S. Gov't, P.H.S.
- # Review # in the "literature review" sense, not "product review"
- # Scientific Integrity Review
- # Study Characteristics
- # Support of Research
- # Systematic Review
- "Technical Report": "report",
- # Twin Study
- # Validation Studies
- # Video-Audio Media
- # Webcasts
-}
-
-MONTH_ABBR_MAP = {
- "Jan": 1,
- "01": 1,
- "Feb": 2,
- "02": 2,
- "Mar": 3,
- "03": 3,
- "Apr": 4,
- "04": 4,
- "May": 5,
- "05": 5,
- "Jun": 6,
- "06": 6,
- "Jul": 7,
- "07": 7,
- "Aug": 8,
- "08": 8,
- "Sep": 9,
- "09": 9,
- "Oct": 10,
- "10": 10,
- "Nov": 11,
- "11": 11,
- "Dec": 12,
- "12": 12,
-}
-
-# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
-COUNTRY_NAME_MAP = {
- "Afghanistan": "af",
- "Albania": "al",
- "Algeria": "dz",
- "Andorra": "ad",
- "Angola": "ao",
- "Antigua and Barbuda": "ag",
- "Argentina": "ar",
- "Armenia": "am",
- "Australia": "au",
- "Austria": "at",
- "Azerbaijan": "az",
- "Bahamas": "bs",
- "Bahrain": "bh",
- "Bangladesh": "bd",
- "Barbados": "bb",
- "Belarus": "by",
- "Belgium": "be",
- "Belize": "bz",
- "Benin": "bj",
- "Bhutan": "bt",
- "Bolivia": "bo",
- "Bosnia and Herzegowina": "ba",
- "Botswana": "bw",
- "Brazil": "br",
- "Brunei Darussalam": "bn",
- "Bulgaria": "bg",
- "Burkina Faso": "bf",
- "Burundi": "bi",
- "Cambodia": "kh",
- "Cameroon": "cm",
- "Canada": "ca",
- "Cape Verde": "cv",
- "Central African Republic": "cf",
- "Chad": "td",
- "Chile": "cl",
- "China": "cn",
- "Colombia": "co",
- "Comoros": "km",
- "Congo, Democratic Republic": "cd",
- "Congo, People’s Republic": "cg",
- "Costa Rica": "cr",
- "Cote d'Ivoire": "ci",
- "Croatia (Local Name: Hrvatska)": "hr",
- "Cuba": "cu",
- "Cyprus": "cy",
- "Czech Republic": "cz",
- "Denmark": "dk",
- "Djibouti": "dj",
- "Dominica": "dm",
- "Dominican Republic": "do",
- "East Timor": "tl",
- "Ecuador": "ec",
- "El Salvador": "sv",
- "Equatorial Guinea": "gq",
- "Eritrea": "er",
- "Estonia": "ee",
- "Ethiopia": "et",
- "Fiji": "fj",
- "Finland": "fi",
- "France": "fr",
- "Gabon": "ga",
- "Gambia": "gm",
- "Georgia": "ge",
- "Germany": "de",
- "Ghana": "gh",
- "Greece": "gr",
- "Greenland": "gl",
- "Grenada": "gd",
- "Guatemala": "gt",
- "Guinea": "gn",
- "Guinea-Bissau": "gw",
- "Guyana": "gy",
- "Haiti": "ht",
- "Honduras": "hn",
- "Hong Kong": "hk",
- "Hungary": "hu",
- "Iceland": "is",
- "India": "in",
- "Indonesia": "id",
- "Iran": "ir",
- "Iraq": "iq",
- "Ireland": "ie",
- "Israel": "il",
- "Italy": "it",
- "Jamaica": "jm",
- "Japan": "jp",
- "Jordan": "jo",
- "Kazakhstan": "kz",
- "Kenya": "ke",
- "Kiribati": "ki",
- "Korea, Democratic People's Republic": "kp",
- "Korea, Republic": "kr",
- "Kuwait": "kw",
- "Kyrgyzstan": "kg",
- "Laos": "la",
- "Latvia": "lv",
- "Lebanon": "lb",
- "Lesotho": "ls",
- "Liberia": "lr",
- "Libya": "ly",
- "Liechtenstein": "li",
- "Lithuania": "lt",
- "Luxembourg": "lu",
- "Macedonia": "mk",
- "Madagascar": "mg",
- "Malawi": "mw",
- "Malaysia": "my",
- "Maldives": "mv",
- "Mali": "ml",
- "Malta": "mt",
- "Marshall Islands": "mh",
- "Mauritania": "mr",
- "Mauritius": "mu",
- "Mexico": "mx",
- "Micronesia": "fm",
- "Moldova": "md",
- "Monaco": "mc",
- "Mongolia": "mn",
- "Morocco": "ma",
- "Mozambique": "mz",
- "Myanmar": "mm",
- "Namibia": "na",
- "Nauru": "nr",
- "Nepal": "np",
- "Netherlands": "nl",
- "New Zealand": "nz",
- "Nicaragua": "ni",
- "Niger": "ne",
- "Nigeria": "ng",
- "Norway": "no",
- "Oman": "om",
- "Pakistan": "pk",
- "Palau": "pw",
- "Panama": "pa",
- "Papua New Guinea": "pg",
- "Paraguay": "py",
- "Peru": "pe",
- "Philippines": "ph",
- "Poland": "pl",
- "Portugal": "pt",
- "Puerto Rico": "pr",
- "Qatar": "qa",
- "Romania": "ro",
- "Russian Federation": "ru",
- "Rwanda": "rw",
- "Saint Kitts and Nevis": "kn",
- "Saint Lucia": "lc",
- "Saint Vincent and the Grenadines": "vc",
- "Samoa": "ws",
- "San Marino": "sm",
- "Sao Tome and Príncipe": "st",
- "Saudi Arabia": "sa",
- "Senegal": "sn",
- "Serbia and Montenegro": "cs",
- "Seychelles": "sc",
- "Sierra Leone": "sl",
- "Singapore": "sg",
- "Slovakia (Slovak Republic)": "sk",
- "Slovenia": "si",
- "Solomon Islands": "sb",
- "Somalia": "so",
- "South Africa": "za",
- "Spain": "es",
- "Sri Lanka": "lk",
- "Sudan": "sd",
- "Suriname": "sr",
- "Swaziland": "sz",
- "Sweden": "se",
- "Switzerland": "ch",
- "Syrian Arab Republic": "sy",
- "Taiwan": "tw",
- "Tajikistan": "tj",
- "Tanzania": "tz",
- "Tanzania": "tz",
- "Thailand": "th",
- "Togo": "tg",
- "Tonga": "to",
- "Trinidad and Tobago": "tt",
- "Tunisia": "tn",
- "Turkey": "tr",
- "Turkmenistan": "tm",
- "Tuvalu": "tv",
- "Uganda": "ug",
- "Ukraine": "ua",
- "United Arab Emirates": "ae",
- "United Kingdom": "gb",
- "United States": "us",
- "Uruguay": "uy",
- # Additions from running over large files
- "Bosnia and Herzegovina": "ba",
- # "International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
- "Russia (Federation)": "ru",
- "Scotland": "gb",
- "England": "gb",
- "Korea (South)": "kr",
- "Georgia (Republic)": "ge",
- "Egypt": "eg",
-}
+from fatcat_tools.biblio_lookup_tables import (
+ COUNTRY_NAME_MAP,
+ LANG_MAP_MARC,
+ MONTH_ABBR_MAP,
+ PUBMED_RELEASE_TYPE_MAP,
+)
+from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str
+
+from .common import EntityImporter
class PubmedImporter(EntityImporter):
@@ -704,14 +402,14 @@ class PubmedImporter(EntityImporter):
if extra_pubmed:
extra["pubmed"] = extra_pubmed
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage=release_stage,
release_date=release_date,
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
deleted file mode 100755
index 5caed2c7..00000000
--- a/python/fatcat_tools/importers/wayback_static.py
+++ /dev/null
@@ -1,287 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Helpers to create Web Capture entities from extracted wayback content.
-
-Works as a stand-alone script (for debugging) or as library routines.
-"""
-
-import argparse
-import datetime
-import hashlib
-import json
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-from bs4 import BeautifulSoup
-from fatcat_openapi_client import (
- ApiClient,
- Editgroup,
- EntityEdit,
- WebcaptureCdxLine,
- WebcaptureEntity,
- WebcaptureUrl,
-)
-
-from .common import b32_hex
-
-CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
-GWB_URL_BASE = "https://web.archive.org/web"
-REQ_SESSION = requests.Session()
-
-
-def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]:
- """Takes a wayback machine URL, and returns a tuple:
-
- (timestamp, datetime, original_url)
- """
- chunks = url.split("/")
- assert len(chunks) >= 6
- assert chunks[2] == "web.archive.org"
- assert chunks[3] == "web"
- return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
-
-
-def test_parse_wbm_url() -> None:
- u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
- assert parse_wbm_url(u) == (
- "20010712114837",
- datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html",
- )
-
-
-def parse_wbm_timestamp(timestamp: str) -> datetime.datetime:
- """
- Takes a complete WBM timestamp string (like "20020327115625") and returns a
- python datetime object (UTC)
- """
- # strip any "im_" or "id_" suffix
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- # inflexible; require the full second-precision timestamp
- assert len(timestamp) == 14
- return datetime.datetime(
- year=int(timestamp[0:4]),
- month=int(timestamp[4:6]),
- day=int(timestamp[6:8]),
- hour=int(timestamp[8:10]),
- minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]),
- )
-
-
-def test_parse_wbm_timestamp() -> None:
- assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
-
-
-def fetch_wbm(url: str) -> bytes:
- resp = REQ_SESSION.get(url)
- resp.raise_for_status()
- assert resp.content
- return resp.content
-
-
-def lookup_cdx(
- embed_url: str, verify_hashes: bool = True, cdx_output: Any = None
-) -> Optional[WebcaptureCdxLine]:
- sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith("/web/")
- embed_url_segments = embed_url.split("/")
- timestamp = embed_url_segments[2]
- if timestamp.endswith("_"):
- timestamp = timestamp[:-3]
- url = "/".join(embed_url_segments[3:])
- # print((timestamp, url))
- params: Dict = dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- )
- resp = REQ_SESSION.get(
- CDX_API_BASE,
- params=params,
- )
- resp.raise_for_status()
- # print(resp.url)
- if resp.content:
- hit = resp.content.decode("utf-8").split("\n")[0]
- if cdx_output:
- cdx_output.write(hit + "\n")
- cdx_chunks = hit.split(" ")
- cdx = [x if (x and x != "-") else None for x in cdx_chunks]
- webcapture_cdx = WebcaptureCdxLine(
- surt=cdx[0],
- timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
- url=cdx[2],
- mimetype=cdx[3],
- status_code=int(cdx[4] or ""),
- sha1=b32_hex(cdx[5] or ""),
- sha256=None,
- )
- if verify_hashes:
- resp = REQ_SESSION.get(
- GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
- )
- resp.raise_for_status()
- assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
- webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
- webcapture_cdx.size = len(resp.content)
- return webcapture_cdx
- else:
- return None
-
-
-def wayback_url_to_relative(url: str) -> Optional[str]:
- """
- Wayback URLs can be relative or absolute in rewritten documents. This
- function converts any form of rewritten URL to a relative (to
- web.archive.org) one, or returns None if it isn't a rewritten URL at all.
- """
- if url.startswith("https://web.archive.org/"):
- url = url[23:]
- elif url.startswith("http://web.archive.org/"):
- url = url[22:]
-
- if url.startswith("/web/"):
- return url
- else:
- return None
-
-
-def extract_embeds(soup: BeautifulSoup) -> List[str]:
-
- embeds = set()
-
- # <link href="">
- for tag in soup.find_all("link", href=True):
- if tag["rel"] not in ("stylesheet",):
- continue
- url = wayback_url_to_relative(tag["href"])
- if url:
- embeds.add(url)
- # <img src="">
- for tag in soup.find_all("img", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- # <script src="">
- for tag in soup.find_all("script", src=True):
- url = wayback_url_to_relative(tag["src"])
- if url:
- embeds.add(url)
-
- return list(embeds)
-
-
-def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity:
- """
- Given a complete wayback machine capture URL, like:
-
- http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html
-
- Will return a new ("bare") fatcat webcapture entity python object, with all
- the CDX entries filled in.
- """
-
- wbm_html = fetch_wbm(wayback_url)
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- # with open(rewritten_path, 'r') as fp:
- # soup = BeautifulSoup(fp, "lxml")
- soup = BeautifulSoup(wbm_html, "lxml")
- embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx(
- "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
- )
- cdx_list = [cdx_obj]
- for url in embeds:
- cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
- cdx_list.append(cdx_obj)
- archive_urls = [
- WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )
- ]
- wc = WebcaptureEntity(
- cdx=cdx_list,
- timestamp=timestamp.isoformat() + "Z",
- original_url=original_url,
- archive_urls=archive_urls,
- release_ids=None,
- )
- return wc
-
-
-def auto_wayback_static(
- api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None
-) -> Tuple[Optional[str], Optional[EntityEdit]]:
- """
- Returns a tuple: (editgroup_id, edit). If failed, both are None
- """
-
- raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
-
- release = api.get_release(release_id, expand="webcaptures")
-
- # check for existing webcapture with same parameters
- for wc in release.webcaptures:
- if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
- # skipping: already existed
- print(
- "release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url
- )
- )
- return (None, None)
-
- wc = static_wayback_webcapture(wayback_url)
- assert len(wc.cdx) >= 1
- wc.release_ids = [release_id]
- if not editgroup_id:
- eg = api.create_editgroup(
- Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
- )
- )
- editgroup_id = eg.editgroup_id
- edit = api.create_webcapture(eg.editgroup_id, wc)
- return (editgroup_id, edit)
-
-
-def main() -> None:
- parser = argparse.ArgumentParser()
- parser.add_argument("--verbose", action="store_true", help="verbose output")
- parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
- parser.add_argument(
- "--json-output",
- type=argparse.FileType("w"),
- default=sys.stdout,
- help="where to write out webcapture entity (as JSON)",
- )
- parser.add_argument(
- "--cdx-output",
- type=argparse.FileType("w"),
- default=None,
- help="(optional) file to write out CDX stub",
- )
-
- args = parser.parse_args()
-
- # entity-to-JSON code; duplicate of entity_to_dict()
- api_client = ApiClient()
- wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output)
- wc_dict = api_client.sanitize_for_serialization(wc)
- print(json.dumps(wc_dict))
-
-
-if __name__ == "__main__":
- main()