author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000
committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000
commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4
tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e
parent | 7e3f91f1a49ea85707cae31125021ba761f5373d
parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these changes are from old, stale branches (the Datacite subject metadata patch), but most are from yesterday and today. It's something of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- the `cdl_dash_dat` and `wayback_static` importers, which were for specific early example entities and have been superseded by other importers
- the "extid map" sqlite3 feature in several importers, which was only used for the initial bulk imports (and maybe should not have been used at all); a condensed sketch of what it did follows this list
Refactors:
- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Not all of them were moved, just the ones that were either generic or so large that they made the importer code hard to read
- shuffled around relative imports and renamed some helper functions (`clean` is now `clean_str`); see the import sketch after this list
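The practical effect on importer code is mostly a change of import paths. A minimal sketch of the new layout, based on the import hunks in this diff (the example arguments are illustrative only):

    # helpers now come from fatcat_tools.normal (clean() became clean_str()),
    # and the large static tables from the new fatcat_tools.biblio_lookup_tables
    from fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP, DOMAIN_REL_MAP
    from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug

    title = clean_str("Some example title")   # was: clean("Some example title")
    rel = DOMAIN_REL_MAP.get("zenodo.org")    # "repository", per the new lookup table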
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to "fix" double slashes in DOIs; that was causing more harm than good (some DOIs really do contain double slashes!), as sketched below
- remove some excess metadata from Datacite 'extra' fields (the `subjects` filter is sketched just before the diff)
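A condensed sketch of the DOI handling change, from the `common.py` hunk in this diff. `is_doi()` now defers to the shared `clean_doi()` helper instead of a hand-rolled check (shown as a free function here for brevity; in the code it is an `EntityImporter` method), and double-slash DOIs like the one below, taken from the new cleanup notes, are no longer rewritten:

    from fatcat_tools.normal import clean_doi

    def is_doi(doi: str) -> bool:
        # previously: doi.startswith("10.") and doi.count("/") >= 1
        return clean_doi(doi) is not None

    # valid DOI with a double slash, from notes/cleanups/double_slash_dois.md;
    # with this change it should pass validation without being rewritten
    assert is_doi("10.1026//1616-1041.3.2.86")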
27 files changed, 874 insertions, 1599 deletions
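And a condensed sketch of the Datacite `subjects` trimming mentioned above, from the `datacite.py` hunk near the end of this diff (the helper name is illustrative; in the importer the filter is inline):

    from typing import Any, Dict, List

    def filter_datacite_subjects(subjects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # subject entries with a schemeUri carry bulky scheme metadata that
        # doesn't compress well; only the plain subject entries are kept in 'extra'
        return [subj for subj in subjects if not subj.get("schemeUri")]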
diff --git a/notes/cleanups/double_slash_dois.md b/notes/cleanups/double_slash_dois.md new file mode 100644 index 00000000..d4e9ded6 --- /dev/null +++ b/notes/cleanups/double_slash_dois.md @@ -0,0 +1,46 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/48 + + +## Investigate + +At least some of these DOIs actually seem valid, like +`10.1026//1616-1041.3.2.86`. So shouldn't be re-writing them! + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | wc -l + # 59,904 + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | pv -l \ + > doubleslash_dois.tsv + +Which prefixes have the most double slashes? + + cat doubleslash_dois.tsv | cut -f2 | cut -d/ -f1 | sort | uniq -c | sort -nr | head + 51220 10.1037 + 2187 10.1026 + 1316 10.1024 + 826 10.1027 + 823 10.14505 + 443 10.17010 + 186 10.46925 + 163 10.37473 + 122 10.18376 + 118 10.29392 + [...] + +All of the 10.1037 DOIs seem to be registered with Crossref, and at least some +have redirects to the not-with-double-slash versions. Not all doi.org lookups +include a redirect. + +I think the "correct thing to do" here is to add special-case handling for the +pubmed and crossref importers, and in any other case allow double slashes. + +Not clear that there are any specific cleanups to be done for now. A broader +"verify that DOIs are actually valid" push and cleanup would make sense; if +that happens checking for mangled double-slash DOIs would make sense. diff --git a/python/README_import.md b/python/README_import.md index 6853a4d7..74e75e14 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine. Usually 24 hours or so on fast production machine. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 ## JALC @@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine. First import a random subset single threaded to create (most) containers. On a fast machine, this takes a couple minutes. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 Then, in parallel: @@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import): Run import in parallel: + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated export FATCAT_AUTH_WORKER_CRAWL=... zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 445acde8..33679868 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -42,8 +42,6 @@ from fatcat_tools.importers import ( SavePaperNowWebImporter, ShadowLibraryImporter, SqlitePusher, - auto_cdl_dash_dat, - auto_wayback_static, ) # Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable @@ -54,7 +52,6 @@ def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( args.api, args.issn_map_file, - extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, ) @@ -72,7 +69,7 @@ def run_crossref(args: argparse.Namespace) -> None: def run_jalc(args: argparse.Namespace) -> None: - ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file) Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run() @@ -316,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None: JsonLinePusher(fmi, args.json_file).run() -def run_wayback_static(args: argparse.Namespace) -> None: - api = args.api - - # find the release - if args.release_id: - release_id = args.release_id - elif args.extid: - idtype = args.extid.split(":")[0] - extid = ":".join(args.extid.split(":")[1:]) - if idtype == "doi": - release_id = api.lookup_release(doi=extid).ident - elif idtype == "pmid": - release_id = api.lookup_release(pmid=extid).ident - elif idtype == "wikidata": - release_id = api.lookup_release(wikidata_qid=extid).ident - else: - raise NotImplementedError("extid type: {}".format(idtype)) - else: - raise Exception("need either release_id or extid argument") - - # create it - (editgroup_id, wc) = auto_wayback_static( - api, release_id, args.wayback_url, editgroup_id=args.editgroup_id - ) - if not wc: - return - print("release_id: {}".format(release_id)) - print("editgroup_id: {}".format(editgroup_id)) - print("webcapture id: {}".format(wc.ident)) - print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) - - -def run_cdl_dash_dat(args: argparse.Namespace) -> None: - api = args.api - - # create it - (editgroup_id, release, fs) = auto_cdl_dash_dat( - api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id - ) - if not (fs and release): - return - print("release_id: {}".format(release.ident)) - print("editgroup_id: {}".format(editgroup_id)) - print("fileset id: {}".format(fs.ident)) - print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) - - def run_datacite(args: argparse.Namespace) -> None: dci = DataciteImporter( args.api, @@ -370,7 +320,6 @@ def run_datacite(args: argparse.Namespace) -> None: edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, - extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file, ) if args.kafka_mode: @@ -495,12 +444,6 @@ def main() -> None: type=argparse.FileType("r"), ) sub_crossref.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) - sub_crossref.add_argument( "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" ) sub_crossref.add_argument( @@ -529,12 +472,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_jalc.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( @@ -913,43 +850,6 @@ def main() -> None: type=argparse.FileType("r"), ) - sub_wayback_static = subparsers.add_parser( - "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback" - ) - sub_wayback_static.set_defaults( - func=run_wayback_static, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_wayback_static.add_argument( - 
"wayback_url", type=str, help="URL of wayback capture to extract from" - ) - sub_wayback_static.add_argument( - "--extid", type=str, help="external identifier for release lookup" - ) - sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier") - sub_wayback_static.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - - sub_cdl_dash_dat = subparsers.add_parser( - "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project" - ) - sub_cdl_dash_dat.set_defaults( - func=run_cdl_dash_dat, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_cdl_dash_dat.add_argument( - "dat_path", type=str, help="local path dat to import (must be the dat discovery key)" - ) - sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier") - sub_cdl_dash_dat.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata") sub_datacite.add_argument( "json_file", @@ -964,12 +864,6 @@ def main() -> None: type=argparse.FileType("r"), ) sub_datacite.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) - sub_datacite.add_argument( "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" ) sub_datacite.add_argument( diff --git a/python/fatcat_tools/biblio_lookup_tables.py b/python/fatcat_tools/biblio_lookup_tables.py new file mode 100644 index 00000000..edb1f5ef --- /dev/null +++ b/python/fatcat_tools/biblio_lookup_tables.py @@ -0,0 +1,626 @@ +""" +This file contains lookup tables and other static data structures used in +bibliographic metadata munging. +""" + +from typing import Dict, Optional + +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? 
+# PubMed/MEDLINE and JSTOR use these MARC codes +# https://www.loc.gov/marc/languages/language_name.html +LANG_MAP_MARC: Dict[str, Optional[str]] = { + "afr": "af", + "alb": "sq", + "amh": "am", + "ara": "ar", + "arm": "hy", + "aze": "az", + "ben": "bn", + "bos": "bs", + "bul": "bg", + "cat": "ca", + "chi": "zh", + "cze": "cs", + "dan": "da", + "dut": "nl", + "eng": "en", + "epo": "eo", + "est": "et", + "fin": "fi", + "fre": "fr", + "geo": "ka", + "ger": "de", + "gla": "gd", + "gre": "el", + "heb": "he", + "hin": "hi", + "hrv": "hr", + "hun": "hu", + "ice": "is", + "ind": "id", + "ita": "it", + "jpn": "ja", + "kin": "rw", + "kor": "ko", + "lat": "la", + "lav": "lv", + "lit": "lt", + "mac": "mk", + "mal": "ml", + "mao": "mi", + "may": "ms", + "nor": "no", + "per": "fa", + "per": "fa", + "pol": "pl", + "por": "pt", + "pus": "ps", + "rum": "ro", + "rus": "ru", + "san": "sa", + "slo": "sk", + "slv": "sl", + "spa": "es", + "srp": "sr", + "swe": "sv", + "tha": "th", + "tur": "tr", + "ukr": "uk", + "urd": "ur", + "vie": "vi", + "wel": "cy", + # additions + "gle": "ga", # "Irish" (Gaelic) + "jav": "jv", # Javanese + "welsh": "cy", # Welsh + "oci": "oc", # Occitan + # Don't have ISO 639-1 codes + "grc": "el", # Ancient Greek; map to modern greek + "map": None, # Austronesian (collection) + "syr": None, # Syriac, Modern + "gem": None, # Old Saxon + "non": None, # Old Norse + "emg": None, # Eastern Meohang + "neg": None, # Negidal + "mul": None, # Multiple languages + "und": None, # Undetermined +} + +# these are mappings from web domains to URL 'rel' for things like file entity +# URL notation +DOMAIN_REL_MAP: Dict[str, str] = { + "archive.org": "archive", + # LOCKSS, Portico, DuraSpace, etc would also be "archive" + "arxiv.org": "repository", + "babel.hathitrust.org": "repository", + "cds.cern.ch": "repository", + "deepblue.lib.umich.edu": "repository", + "europepmc.org": "repository", + "hal.inria.fr": "repository", + "scielo.isciii.es": "repository", + "www.dtic.mil": "repository", + "www.jstage.jst.go.jp": "repository", + "www.jstor.org": "repository", + "www.ncbi.nlm.nih.gov": "repository", + "ftp.ncbi.nlm.nih.gov": "repository", + "www.scielo.br": "repository", + "www.scielo.cl": "repository", + "www.scielo.org.mx": "repository", + "zenodo.org": "repository", + "www.biorxiv.org": "repository", + "www.medrxiv.org": "repository", + "citeseerx.ist.psu.edu": "aggregator", + "publisher-connector.core.ac.uk": "aggregator", + "core.ac.uk": "aggregator", + "static.aminer.org": "aggregator", + "aminer.org": "aggregator", + "pdfs.semanticscholar.org": "aggregator", + "semanticscholar.org": "aggregator", + "www.semanticscholar.org": "aggregator", + "academic.oup.com": "publisher", + "cdn.elifesciences.org": "publisher", + "cell.com": "publisher", + "dl.acm.org": "publisher", + "downloads.hindawi.com": "publisher", + "elifesciences.org": "publisher", + "iopscience.iop.org": "publisher", + "journals.plos.org": "publisher", + "link.springer.com": "publisher", + "onlinelibrary.wiley.com": "publisher", + "works.bepress.com": "publisher", + "www.biomedcentral.com": "publisher", + "www.cell.com": "publisher", + "www.nature.com": "publisher", + "www.pnas.org": "publisher", + "www.tandfonline.com": "publisher", + "www.frontiersin.org": "publisher", + "www.degruyter.com": "publisher", + "www.mdpi.com": "publisher", + "www.ahajournals.org": "publisher", + "ehp.niehs.nih.gov": "publisher", + "journals.tsu.ru": "publisher", + "www.cogentoa.com": "publisher", + "www.researchgate.net": "academicsocial", + 
"academia.edu": "academicsocial", + "wayback.archive-it.org": "webarchive", + "web.archive.org": "webarchive", + "archive.is": "webarchive", +} + +# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly +PUBMED_RELEASE_TYPE_MAP = { + # Adaptive Clinical Trial + "Address": "speech", + "Autobiography": "book", + # Bibliography + "Biography": "book", + # Case Reports + "Classical Article": "article-journal", + # Clinical Conference + # Clinical Study + # Clinical Trial + # Clinical Trial, Phase I + # Clinical Trial, Phase II + # Clinical Trial, Phase III + # Clinical Trial, Phase IV + # Clinical Trial Protocol + # Clinical Trial, Veterinary + # Collected Works + # Comparative Study + # Congress + # Consensus Development Conference + # Consensus Development Conference, NIH + # Controlled Clinical Trial + "Dataset": "dataset", + # Dictionary + # Directory + # Duplicate Publication + "Editorial": "editorial", + # English Abstract # doesn't indicate that this is abstract-only + # Equivalence Trial + # Evaluation Studies + # Expression of Concern + # Festschrift + # Government Document + # Guideline + "Historical Article": "article-journal", + # Interactive Tutorial + "Interview": "interview", + "Introductory Journal Article": "article-journal", + "Journal Article": "article-journal", + "Lecture": "speech", + "Legal Case": "legal_case", + "Legislation": "legislation", + "Letter": "letter", + # Meta-Analysis + # Multicenter Study + # News + "Newspaper Article": "article-newspaper", + # Observational Study + # Observational Study, Veterinary + # Overall + # Patient Education Handout + # Periodical Index + # Personal Narrative + # Portrait + # Practice Guideline + # Pragmatic Clinical Trial + # Publication Components + # Publication Formats + # Publication Type Category + # Randomized Controlled Trial + # Research Support, American Recovery and Reinvestment Act + # Research Support, N.I.H., Extramural + # Research Support, N.I.H., Intramural + # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + # Research Support, U.S. Gov't, P.H.S. 
+ # Review # in the "literature review" sense, not "product review" + # Scientific Integrity Review + # Study Characteristics + # Support of Research + # Systematic Review + "Technical Report": "report", + # Twin Study + # Validation Studies + # Video-Audio Media + # Webcasts +} + +MONTH_ABBR_MAP: Dict[str, int] = { + "Jan": 1, + "01": 1, + "Feb": 2, + "02": 2, + "Mar": 3, + "03": 3, + "Apr": 4, + "04": 4, + "May": 5, + "05": 5, + "Jun": 6, + "06": 6, + "Jul": 7, + "07": 7, + "Aug": 8, + "08": 8, + "Sep": 9, + "09": 9, + "Oct": 10, + "10": 10, + "Nov": 11, + "11": 11, + "Dec": 12, + "12": 12, +} + +# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ +COUNTRY_NAME_MAP: Dict[str, str] = { + "Afghanistan": "af", + "Albania": "al", + "Algeria": "dz", + "Andorra": "ad", + "Angola": "ao", + "Antigua and Barbuda": "ag", + "Argentina": "ar", + "Armenia": "am", + "Australia": "au", + "Austria": "at", + "Azerbaijan": "az", + "Bahamas": "bs", + "Bahrain": "bh", + "Bangladesh": "bd", + "Barbados": "bb", + "Belarus": "by", + "Belgium": "be", + "Belize": "bz", + "Benin": "bj", + "Bhutan": "bt", + "Bolivia": "bo", + "Bosnia and Herzegowina": "ba", + "Botswana": "bw", + "Brazil": "br", + "Brunei Darussalam": "bn", + "Bulgaria": "bg", + "Burkina Faso": "bf", + "Burundi": "bi", + "Cambodia": "kh", + "Cameroon": "cm", + "Canada": "ca", + "Cape Verde": "cv", + "Central African Republic": "cf", + "Chad": "td", + "Chile": "cl", + "China": "cn", + "Colombia": "co", + "Comoros": "km", + "Congo, Democratic Republic": "cd", + "Congo, People’s Republic": "cg", + "Costa Rica": "cr", + "Cote d'Ivoire": "ci", + "Croatia (Local Name: Hrvatska)": "hr", + "Cuba": "cu", + "Cyprus": "cy", + "Czech Republic": "cz", + "Denmark": "dk", + "Djibouti": "dj", + "Dominica": "dm", + "Dominican Republic": "do", + "East Timor": "tl", + "Ecuador": "ec", + "El Salvador": "sv", + "Equatorial Guinea": "gq", + "Eritrea": "er", + "Estonia": "ee", + "Ethiopia": "et", + "Fiji": "fj", + "Finland": "fi", + "France": "fr", + "Gabon": "ga", + "Gambia": "gm", + "Georgia": "ge", + "Germany": "de", + "Ghana": "gh", + "Greece": "gr", + "Greenland": "gl", + "Grenada": "gd", + "Guatemala": "gt", + "Guinea": "gn", + "Guinea-Bissau": "gw", + "Guyana": "gy", + "Haiti": "ht", + "Honduras": "hn", + "Hong Kong": "hk", + "Hungary": "hu", + "Iceland": "is", + "India": "in", + "Indonesia": "id", + "Iran": "ir", + "Iraq": "iq", + "Ireland": "ie", + "Israel": "il", + "Italy": "it", + "Jamaica": "jm", + "Japan": "jp", + "Jordan": "jo", + "Kazakhstan": "kz", + "Kenya": "ke", + "Kiribati": "ki", + "Korea, Democratic People's Republic": "kp", + "Korea, Republic": "kr", + "Kuwait": "kw", + "Kyrgyzstan": "kg", + "Laos": "la", + "Latvia": "lv", + "Lebanon": "lb", + "Lesotho": "ls", + "Liberia": "lr", + "Libya": "ly", + "Liechtenstein": "li", + "Lithuania": "lt", + "Luxembourg": "lu", + "Macedonia": "mk", + "Madagascar": "mg", + "Malawi": "mw", + "Malaysia": "my", + "Maldives": "mv", + "Mali": "ml", + "Malta": "mt", + "Marshall Islands": "mh", + "Mauritania": "mr", + "Mauritius": "mu", + "Mexico": "mx", + "Micronesia": "fm", + "Moldova": "md", + "Monaco": "mc", + "Mongolia": "mn", + "Morocco": "ma", + "Mozambique": "mz", + "Myanmar": "mm", + "Namibia": "na", + "Nauru": "nr", + "Nepal": "np", + "Netherlands": "nl", + "New Zealand": "nz", + "Nicaragua": "ni", + "Niger": "ne", + "Nigeria": "ng", + "Norway": "no", + "Oman": "om", + "Pakistan": "pk", + "Palau": "pw", + "Panama": "pa", + "Papua New Guinea": "pg", + "Paraguay": "py", + "Peru": "pe", + "Philippines": "ph", + 
"Poland": "pl", + "Portugal": "pt", + "Puerto Rico": "pr", + "Qatar": "qa", + "Romania": "ro", + "Russian Federation": "ru", + "Rwanda": "rw", + "Saint Kitts and Nevis": "kn", + "Saint Lucia": "lc", + "Saint Vincent and the Grenadines": "vc", + "Samoa": "ws", + "San Marino": "sm", + "Sao Tome and Príncipe": "st", + "Saudi Arabia": "sa", + "Senegal": "sn", + "Serbia and Montenegro": "cs", + "Seychelles": "sc", + "Sierra Leone": "sl", + "Singapore": "sg", + "Slovakia (Slovak Republic)": "sk", + "Slovenia": "si", + "Solomon Islands": "sb", + "Somalia": "so", + "South Africa": "za", + "Spain": "es", + "Sri Lanka": "lk", + "Sudan": "sd", + "Suriname": "sr", + "Swaziland": "sz", + "Sweden": "se", + "Switzerland": "ch", + "Syrian Arab Republic": "sy", + "Taiwan": "tw", + "Tajikistan": "tj", + "Tanzania": "tz", + "Tanzania": "tz", + "Thailand": "th", + "Togo": "tg", + "Tonga": "to", + "Trinidad and Tobago": "tt", + "Tunisia": "tn", + "Turkey": "tr", + "Turkmenistan": "tm", + "Tuvalu": "tv", + "Uganda": "ug", + "Ukraine": "ua", + "United Arab Emirates": "ae", + "United Kingdom": "gb", + "United States": "us", + "Uruguay": "uy", + # Additions from running over large files + "Bosnia and Herzegovina": "ba", + # "International" + "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn + "Russia (Federation)": "ru", + "Scotland": "gb", + "England": "gb", + "Korea (South)": "kr", + "Georgia (Republic)": "ge", + "Egypt": "eg", +} + +CONTAINER_TYPE_MAP: Dict[str, str] = { + "article-journal": "journal", + "paper-conference": "conference", + "book": "book-series", +} + +# These are based, informally, on sorting the most popular licenses found in +# Crossref metadata. There were over 500 unique strings and only a few most +# popular are here; many were variants of the CC URLs. Would be useful to +# normalize CC licenses better. +# The current norm is to only add license slugs that are at least partially OA. +# NOTE: URL patterns should be lower-case, and have any trailing slash ("/") +# removed. 
Slugs are usually upper-case acronyms +LICENSE_SLUG_MAP: Dict[str, str] = { + "//creativecommons.org/publicdomain/mark/1.0": "CC-0", + "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", + "//creativecommons.org/publicdomain/zero/1.0": "CC-0", + "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", + "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", + "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0", + "//creativecommons.org/licenses/by/2.0": "CC-BY", + "//creativecommons.org/licenses/by/3.0": "CC-BY", + "//creativecommons.org/licenses/by/4.0": "CC-BY", + "//creativecommons.org/licenses/by-sa/3.0": "CC-BY-SA", + "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA", + "//creativecommons.org/licenses/by-nd/3.0": "CC-BY-ND", + "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND", + "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC", + "//creativecommons.org/licenses/by-nc-sa/3.0": "CC-BY-NC-SA", + "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA", + "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND", + "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND", + "//creativecommons.org/share-your-work/public-domain/cc0": "CC-0", + "//spdx.org/licenses/cc0-1.0.json": "CC-0", + "//spdx.org/licenses/cc-by-1.0.json": "CC-BY", + "//spdx.org/licenses/cc-by-4.0.json": "CC-BY", + "//spdx.org/licenses/cc-by-nc-4.0.json": "CC-BY-NC", + "//spdx.org/licenses/cc-by-sa-3.0.json": "CC-BY-SA", + "//spdx.org/licenses/cc-by-sa-4.0.json": "CC-BY-SA", + "//spdx.org/licenses/mit.json": "MIT", + "//spdx.org/licenses/ogl-canada-2.0.json": "OGL-Canada", + "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0", + "//www.elsevier.com/tdm/userlicense/1.0": "ELSEVIER-USER-1.0", + "//www.karger.com/services/siteLicenses": "KARGER", + "//www.karger.com/services/siteLicenses": "KARGER", + "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK", + "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml": "ADS-UK", + "//archaeologydataservice.ac.uk/advice/termsofuseandaccess": "ADS-UK", + "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET", + "//homepage.data-planet.com/terms-use": "SAGE-DATA-PLANET", + "//publikationen.bibliothek.kit.edu/kitopen-lizenz": "KIT-OPEN", + "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", + "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", + "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", + "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", + "//www.ametsoc.org/pubsreuselicenses": "AMETSOC", + "//www.ametsoc.org/pubsreuselicenses": "AMETSOC", + "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", + "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", + "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", + "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", + "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", + "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3", + "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2", + "//arxiv.org/licenses/nonexclusive-distrib/1.0": "ARXIV-1.0", + # skip these non-OA licenses + # //iopscience.iop.org/page/copyright is closed + # //www.acm.org/publications/policies/copyright_policy#Background is closed + # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!) 
+ # skip these TDM licenses; they don't apply to content + # "//www.springer.com/tdm": "SPRINGER-TDM", + # "//journals.sagepub.com/page/policies/text-and-data-mining-license": "SAGE-TDM", + # "//doi.wiley.com/10.1002/tdm_license_1.1": "WILEY-TDM-1.1", + # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license + # //www.springer.com/tdm doesn't seem like a license + # //rsc.li/journals-terms-of-use is closed for vor (am open) +} + +# Map various datacite type types to CSL-ish types. None means TODO or remove. +DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { + "ris": { + "THES": "thesis", + "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) + "CHAP": "chapter", + "FIGURE": "figure", + "RPRT": "report", + "JOUR": "article-journal", + "MPCT": "motion_picture", + "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset + "BOOK": "book", + "DATA": "dataset", + "COMP": "software", + }, + "schemaOrg": { + "Dataset": "dataset", + "Book": "book", + "ScholarlyArticle": "article-journal", + "ImageObject": "graphic", + "Collection": None, + "MediaObject": None, + "Event": None, + "SoftwareSourceCode": "software", + "Chapter": "chapter", + "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. + "PublicationIssue": "article", + "AudioObject": None, + "Thesis": "thesis", + }, + "citeproc": { + "article": "article", + "article-journal": "article-journal", + "article-magazine": "article-magazine", + "article-newspaper": "article-newspaper", + "bill": "bill", + "book": "book", + "broadcast": "broadcast", + "chapter": "chapter", + "dataset": "dataset", + "entry-dictionary": "entry-dictionary", + "entry-encyclopedia": "entry-encyclopedia", + "entry": "entry", + "figure": "figure", + "graphic": "graphic", + "interview": "interview", + "legal_case": "legal_case", + "legislation": "legislation", + "manuscript": "manuscript", + "map": "map", + "motion_picture": "motion_picture", + "musical_score": "musical_score", + "pamphlet": "pamphlet", + "paper-conference": "paper-conference", + "patent": "patent", + "personal_communication": "personal_communication", + "post": "post", + "post-weblog": "post-weblog", + "report": "report", + "review-book": "review-book", + "review": "review", + "song": "song", + "speech": "speech", + "thesis": "thesis", + "treaty": "treaty", + "webpage": "webpage", + }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types + "bibtex": { + "phdthesis": "thesis", + "inbook": "chapter", + "misc": None, + "article": "article-journal", + "book": "book", + }, + "resourceTypeGeneral": { + "Image": "graphic", + "Dataset": "dataset", + "PhysicalObject": None, + "Collection": None, + "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" + "Sound": None, + "InteractiveResource": None, + "Event": None, + "Software": "software", + "Other": None, + "Workflow": None, + "Audiovisual": None, + }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 +} diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 06ecfd58..654be2e9 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -13,10 +13,8 @@ To run an import you combine two classes; one each of: from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, ArabesqueMatchImporter from .arxiv import ArxivRawImporter -from .cdl_dash_dat 
import auto_cdl_dash_dat from .chocula import ChoculaImporter from .common import ( - LANG_MAP_MARC, Bs4XmlFileListPusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, @@ -28,11 +26,8 @@ from .common import ( KafkaJsonPusher, LinePusher, SqlitePusher, - clean, - is_cjk, - make_kafka_consumer, ) -from .crossref import CROSSREF_TYPE_MAP, CrossrefImporter, lookup_license_slug +from .crossref import CrossrefImporter from .datacite import DataciteImporter from .dblp_container import DblpContainerImporter from .dblp_release import DblpReleaseImporter @@ -55,4 +50,3 @@ from .matched import MatchedImporter from .orcid import OrcidImporter from .pubmed import PubmedImporter from .shadow import ShadowLibraryImporter -from .wayback_static import auto_wayback_static diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index b4a4d9ed..92289bb3 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity -from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url +from fatcat_tools.normal import b32_hex + +from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, make_rel_url ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 1d50dd9a..dd2c2284 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -9,6 +9,8 @@ from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text +from fatcat_tools.normal import clean_doi + from .common import EntityImporter from .crossref import lookup_license_slug @@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter): base_id = metadata.id.string doi = None if metadata.doi and metadata.doi.string: - doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): + doi = clean_doi(metadata.doi.string.lower().split()[0].strip()) + if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None title = latex_to_text(metadata.title.get_text().replace("\n", " ")) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py deleted file mode 100755 index 1a4114a0..00000000 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 - -import hashlib -import json -import mimetypes -import os -import subprocess -import sys -import urllib -import urllib.parse -from typing import Any, Dict, List, Optional, Tuple - -import fatcat_openapi_client -import magic -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - FilesetEntity, - FilesetFile, - ReleaseAbstract, - ReleaseContrib, - ReleaseEntity, - ReleaseExtIds, -) - -from .common import clean -from .crossref import lookup_license_slug - - -def single_file(prefix: str, path: str) -> FilesetFile: - - full = prefix + path - size_bytes = os.stat(full).st_size - - hashes = [ - hashlib.md5(), - hashlib.sha1(), - hashlib.sha256(), - ] - with open(full, "rb") as fp: - while True: - data = fp.read(2 ** 20) - if not data: - break - for h in hashes: - h.update(data) - mime = 
magic.Magic(mime=True).from_file(full) - if mime == "application/octet-stream": - # magic apparently isn't that great; try using filename as well - guess = mimetypes.guess_type(full)[0] - if guess: - mime = guess - - fsf = FilesetFile( - path=path, - size=size_bytes, - md5=hashes[0].hexdigest(), - sha1=hashes[1].hexdigest(), - sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime), - ) - return fsf - - -def make_manifest(base_dir: str) -> List[FilesetFile]: - manifest = [] - for root, dirs, files in os.walk(base_dir): - for f in files: - manifest.append(single_file(root, f)) - return manifest - - -def cdl_dash_release( - meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None -) -> ReleaseEntity: - - if not extra: - extra = dict() - - assert meta["identifier"]["type"] == "DOI" - doi = meta["identifier"]["value"].lower() - assert doi.startswith("10.") - - ark_id = None - for extid in meta.get("alternativeIdentifiers", []): - if extid["value"].startswith("ark:"): - ark_id = extid["value"] - assert ark_id - - license_slug = lookup_license_slug(meta["rights"]["uri"]) - - abstracts = [] - for desc in meta["descriptions"]: - if desc["type"] == "abstract": - abstracts.append( - ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) - ) - # print(abstracts) - - contribs = [] - for creator in meta["creator"]: - contribs.append( - ReleaseContrib( - given_name=creator["given"], - surname=creator["family"], - # sorry everybody - raw_name="{} {}".format(creator["given"], creator["family"]), - raw_affiliation=creator.get("affiliation"), - role="author", # presumably, for these datasets? - ) - ) - - r = ReleaseEntity( - ext_ids=ReleaseExtIds( - doi=doi, - ark=ark_id, - ), - title=clean(meta["title"], force_xml=True), - publisher=clean(meta["publisher"]), - release_year=int(meta["publicationYear"]), - release_type="dataset", - license_slug=license_slug, - contribs=contribs, - abstracts=abstracts or None, - extra=extra, - ) - return r - - -def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: - - if dat_path.endswith("/"): - dat_path = dat_path[:-1] - dat_discovery = dat_path - extra = dict() - assert len(dat_discovery) == 64 - - with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: - meta_dict = json.loads(fp.read()) - - release = cdl_dash_release(meta_dict) - ark_id = release.extra["ark_id"] - - dash_version = None - # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", "r") as fp: - for line in fp: - line = line.strip() - if line.startswith("<st:version_number>"): - dash_version = int(line[19:].split("<")[0]) - assert dash_version is not None - extra["cdl_dash"] = dict(version=dash_version) - release.extra["cdl_dash"] = dict(version=dash_version) - - manifest = make_manifest(dat_path + "/files/") - - bundle_url = dict( - url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo-bundle", - ) - repo_url = dict( - url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo", - ) - dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") - fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra - ) - return (release, fs) - - -def auto_cdl_dash_dat( - api: ApiClient, - dat_path: str, - release_id: Optional[str] = None, - editgroup_id: Optional[str] = None, -) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: - - git_rev = 
subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - - (release, fileset) = make_release_fileset(dat_path) - - if not editgroup_id: - eg = api.create_editgroup( - Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), - ) - ) - editgroup_id = eg.editgroup_id - - if not release_id and release.ext_ids.doi: - try: - r = api.lookup_release(doi=release.ext_ids.doi) - release_id = r.ident - except fatcat_openapi_client.rest.ApiException: - pass - if not release_id: - edit = api.create_release(eg.editgroup_id, release) - release_id = edit.ident - - release = api.get_release(release_id, expand="filesets") - if len(release.filesets): - print("A fileset already exists for release {}".format(release.ident)) - return (None, None, None) - - fileset.release_ids = [release.ident] - edit = api.create_fileset(eg.editgroup_id, fileset) - fileset = api.get_fileset(edit.ident) - return (editgroup_id, release, fileset) - - -if __name__ == "__main__": - # pass this a discovery key that has been cloned to the local directory - print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 842c7853..c44fec3b 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter class ChoculaImporter(EntityImporter): @@ -40,7 +42,7 @@ class ChoculaImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - name = clean(row.get("name")) + name = clean_str(row.get("name")) if not name: # Name is required (by schema) return None @@ -85,7 +87,7 @@ class ChoculaImporter(EntityImporter): ident=row["ident"], name=name, container_type=container_type, - publisher=clean(row.get("publisher")), + publisher=clean_str(row.get("publisher")), wikidata_qid=row.get("wikidata_qid"), extra=extra, ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2ec6efda..e2157ee5 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -27,74 +27,14 @@ from fatcat_openapi_client import ( from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy -# TODO: refactor so remove need for this (re-imports for backwards compatibility) -from fatcat_tools.normal import is_cjk # noqa: F401 -from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401 -from fatcat_tools.normal import clean_str as clean # noqa: F401 +from fatcat_tools.biblio_lookup_tables import DOMAIN_REL_MAP +from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict DATE_FMT: str = "%Y-%m-%d" SANE_MAX_RELEASES: int = 200 SANE_MAX_URLS: int = 100 - -DOMAIN_REL_MAP: Dict[str, str] = { - "archive.org": "archive", - # LOCKSS, Portico, DuraSpace, etc would also be "archive" - "arxiv.org": "repository", - "babel.hathitrust.org": "repository", - "cds.cern.ch": "repository", - "deepblue.lib.umich.edu": "repository", - "europepmc.org": "repository", - "hal.inria.fr": "repository", - "scielo.isciii.es": "repository", - "www.dtic.mil": "repository", - 
"www.jstage.jst.go.jp": "repository", - "www.jstor.org": "repository", - "www.ncbi.nlm.nih.gov": "repository", - "ftp.ncbi.nlm.nih.gov": "repository", - "www.scielo.br": "repository", - "www.scielo.cl": "repository", - "www.scielo.org.mx": "repository", - "zenodo.org": "repository", - "www.biorxiv.org": "repository", - "www.medrxiv.org": "repository", - "citeseerx.ist.psu.edu": "aggregator", - "publisher-connector.core.ac.uk": "aggregator", - "core.ac.uk": "aggregator", - "static.aminer.org": "aggregator", - "aminer.org": "aggregator", - "pdfs.semanticscholar.org": "aggregator", - "semanticscholar.org": "aggregator", - "www.semanticscholar.org": "aggregator", - "academic.oup.com": "publisher", - "cdn.elifesciences.org": "publisher", - "cell.com": "publisher", - "dl.acm.org": "publisher", - "downloads.hindawi.com": "publisher", - "elifesciences.org": "publisher", - "iopscience.iop.org": "publisher", - "journals.plos.org": "publisher", - "link.springer.com": "publisher", - "onlinelibrary.wiley.com": "publisher", - "works.bepress.com": "publisher", - "www.biomedcentral.com": "publisher", - "www.cell.com": "publisher", - "www.nature.com": "publisher", - "www.pnas.org": "publisher", - "www.tandfonline.com": "publisher", - "www.frontiersin.org": "publisher", - "www.degruyter.com": "publisher", - "www.mdpi.com": "publisher", - "www.ahajournals.org": "publisher", - "ehp.niehs.nih.gov": "publisher", - "journals.tsu.ru": "publisher", - "www.cogentoa.com": "publisher", - "www.researchgate.net": "academicsocial", - "academia.edu": "academicsocial", - "wayback.archive-it.org": "webarchive", - "web.archive.org": "webarchive", - "archive.is": "webarchive", -} +MAX_ABSTRACT_LENGTH: int = 2048 def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]: @@ -342,8 +282,7 @@ class EntityImporter: return creator_id def is_doi(self, doi: str) -> bool: - # TODO: replace with clean_doi() from fatcat_tools.normal - return doi.startswith("10.") and doi.count("/") >= 1 + return clean_doi(doi) is not None def lookup_doi(self, doi: str) -> Optional[str]: """Caches calls to the doi lookup API endpoint in a local dict diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index a41e2bf5..52bd7465 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,11 +1,13 @@ import datetime -import sqlite3 from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from .common import EntityImporter, clean +from fatcat_tools.biblio_lookup_tables import CONTAINER_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug + +from .common import EntityImporter # The docs/guide should be the canonical home for these mappings; update there # first @@ -32,104 +34,11 @@ CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { "standard": "standard", } -CONTAINER_TYPE_MAP: Dict[str, str] = { - "article-journal": "journal", - "paper-conference": "conference", - "book": "book-series", -} - -# These are based, informally, on sorting the most popular licenses found in -# Crossref metadata. There were over 500 unique strings and only a few most -# popular are here; many were variants of the CC URLs. Would be useful to -# normalize CC licenses better. -# The current norm is to only add license slugs that are at least partially OA. 
-LICENSE_SLUG_MAP: Dict[str, str] = { - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/zero/1.0/": "CC-0", - "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0", - "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", - "//creativecommons.org/licenses/by/2.0/": "CC-BY", - "//creativecommons.org/licenses/by/3.0/": "CC-BY", - "//creativecommons.org/licenses/by/4.0/": "CC-BY", - "//creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", - "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", - "//creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", - "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", - "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", - "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", - "//creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", - "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", - "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", - "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", - "//spdx.org/licenses/CC0-1.0.json": "CC-0", - "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", - "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", - "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", - "//spdx.org/licenses/MIT.json": "MIT", - "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", - "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.karger.com/Services/SiteLicenses": "KARGER", - "//pubs.acs.org/page/policy/authorchoice_termsofuse.html": "ACS-CHOICE", - "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html": "CC-BY", - "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", - "//www.biologists.com/user-licence-1-1": "BIOLOGISTS-USER", - "//www.apa.org/pubs/journals/resources/open-access.aspx": "APA", - "//www.ametsoc.org/PUBSReuseLicenses": "AMETSOC", - # //onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license - # //www.springer.com/tdm doesn't seem like a license - # //iopscience.iop.org/page/copyright is closed - # //www.acm.org/publications/policies/copyright_policy#Background is closed - # //rsc.li/journals-terms-of-use is closed for vor (am open) - # //www.ieee.org/publications_standards/publications/rights/ieeecopyrightform.pdf is 404 (!) 
- "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", -} - - -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: - if not raw: - return None - raw = raw.strip().replace("http://", "//").replace("https://", "//") - if "creativecommons.org" in raw.lower(): - raw = raw.lower() - raw = raw.replace("/legalcode", "/").replace("/uk", "") - if not raw.endswith("/"): - raw = raw + "/" - return LICENSE_SLUG_MAP.get(raw) - - -def test_lookup_license_slug() -> None: - - assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert ( - lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") - == "CC-BY" - ) - assert ( - lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") - == "CC-0" - ) - assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert ( - lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") - == "CC-BY-NC-SA" - ) - assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" - assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None - assert lookup_license_slug("") is None - assert lookup_license_slug(None) is None - class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. - Can use a local sqlite3 file for faster "external identifier" lookups - See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ @@ -150,50 +59,8 @@ class CrossrefImporter(EntityImporter): ) self.create_containers: bool = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db: Optional[Any] = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Optional[Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def map_release_type(self, crossref_type: str) -> Optional[str]: return CROSSREF_TYPE_MAP.get(crossref_type) @@ -275,21 +142,21 @@ class CrossrefImporter(EntityImporter): if len(affiliation_list) > 1: # note: affiliation => more_affiliations extra["more_affiliations"] = [ - clean(a["name"]) for a in affiliation_list[1:] + clean_str(a["name"]) for a in affiliation_list[1:] ] if am.get("sequence") and am.get("sequence") != "additional": - extra["seq"] = clean(am.get("sequence")) + extra["seq"] = clean_str(am.get("sequence")) assert ctype in ("author", "editor", "translator") - raw_name = clean(raw_name) + raw_name = clean_str(raw_name) # TODO: what if 'raw_name' is None? 
contribs.append( ReleaseContrib( creator_id=creator_id, index=index, raw_name=raw_name, - given_name=clean(am.get("given")), - surname=clean(am.get("family")), - raw_affiliation=clean(raw_affiliation), + given_name=clean_str(am.get("given")), + surname=clean_str(am.get("family")), + raw_affiliation=clean_str(raw_affiliation), role=ctype, extra=extra or None, ) @@ -306,11 +173,11 @@ class CrossrefImporter(EntityImporter): container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get("publisher")) + publisher = clean_str(obj.get("publisher")) container_name = obj.get("container-title") if container_name: - container_name = clean(container_name[0], force_xml=True) + container_name = clean_str(container_name[0], force_xml=True) if not container_name: container_name = None if ( @@ -366,7 +233,7 @@ class CrossrefImporter(EntityImporter): ref_extra["journal-title"] = rm["journal-title"] if rm.get("DOI"): ref_extra["doi"] = rm.get("DOI").lower() - author = clean(rm.get("author")) + author = clean_str(rm.get("author")) if author: ref_extra["authors"] = [author] for k in ( @@ -390,8 +257,8 @@ class CrossrefImporter(EntityImporter): "series-title", "volume-title", ): - if clean(rm.get(k)): - ref_extra[k] = clean(rm[k]) + if clean_str(rm.get(k)): + ref_extra[k] = clean_str(rm[k]) refs.append( fatcat_openapi_client.ReleaseRef( index=i, @@ -399,9 +266,9 @@ class CrossrefImporter(EntityImporter): target_release_id=None, key=key, year=year, - container_name=clean(ref_container_name), - title=clean(rm.get("article-title")), - locator=clean(rm.get("first-page")), + container_name=clean_str(ref_container_name), + title=clean_str(rm.get("article-title")), + locator=clean_str(rm.get("first-page")), # TODO: just dump JSON somewhere here? 
extra=ref_extra or None, ) @@ -409,7 +276,7 @@ class CrossrefImporter(EntityImporter): # abstracts abstracts = [] - abstract = clean(obj.get("abstract")) + abstract = clean_str(obj.get("abstract")) if abstract and len(abstract) > 10: abstracts.append( fatcat_openapi_client.ReleaseAbstract( @@ -430,9 +297,9 @@ class CrossrefImporter(EntityImporter): if type(val) == list: val = val[0] if type(val) == str: - val = clean(val) + val = clean_str(val) if val: - extra[key] = clean(val) + extra[key] = clean_str(val) else: extra[key] = val # crossref-nested extra keys @@ -440,14 +307,14 @@ class CrossrefImporter(EntityImporter): val = obj.get(key) if val: if type(val) == str: - extra_crossref[key] = clean(val) + extra_crossref[key] = clean_str(val) else: extra_crossref[key] = val if license_extra: extra_crossref["license"] = license_extra if len(obj["title"]) > 1: - aliases = [clean(t) for t in obj["title"][1:]] + aliases = [clean_str(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: extra["aliases"] = aliases @@ -473,9 +340,6 @@ class CrossrefImporter(EntityImporter): # unknown release_stage = None - # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} - # filter out unreasonably huge releases if len(abstracts) > 100: self.counts["skip-huge-abstracts"] += 1 @@ -505,19 +369,24 @@ class CrossrefImporter(EntityImporter): if obj.get("original-title"): ot = obj.get("original-title") if ot is not None: - original_title = clean(ot[0], force_xml=True) + original_title = clean_str(ot[0], force_xml=True) title: Optional[str] = None if obj.get("title"): - title = clean(obj["title"][0], force_xml=True) + title = clean_str(obj["title"][0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character self.counts["skip-blank-title"] += 1 return None + doi = clean_doi(obj["DOI"].lower()) + if not doi: + self.counts["skip-bad-doi"] += 1 + return None + subtitle = None if obj.get("subtitle"): - subtitle = clean(obj["subtitle"][0], force_xml=True) + subtitle = clean_str(obj["subtitle"][0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None @@ -537,19 +406,13 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj["DOI"].lower(), - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], + doi=doi, isbn13=isbn13, - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), - volume=clean(obj.get("volume")), - issue=clean(obj.get("issue")), - pages=clean(obj.get("page")), - language=clean(obj.get("language")), + volume=clean_str(obj.get("volume")), + issue=clean_str(obj.get("issue")), + pages=clean_str(obj.get("page")), + language=clean_str(obj.get("language")), license_slug=license_slug, extra=extra or None, abstracts=abstracts or None, diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d5622960..b310f8bc 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -12,7 +12,6 @@ import collections import datetime import json import re -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence, Set, Tuple @@ -22,113 +21,19 @@ import langdetect import pycountry from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from 
fatcat_tools.biblio_lookup_tables import DATACITE_TYPE_MAP +from fatcat_tools.normal import clean_doi, clean_str, lookup_license_slug from fatcat_tools.transforms import entity_to_dict -from .common import EntityImporter, clean - -# Cutoff length for abstracts. -MAX_ABSTRACT_LENGTH = 2048 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP: Dict[str, str] = { +DATACITE_CONTAINER_TYPE_MAP: Dict[str, str] = { "Journal": "journal", "Series": "journal", "Book Series": "book-series", } -# The docs/guide should be the canonical home for these mappings; update there -# first. Map various datacite type types to CSL-ish types. None means TODO or -# remove. -DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { - "ris": { - "THES": "thesis", - "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) - "CHAP": "chapter", - "FIGURE": "figure", - "RPRT": "report", - "JOUR": "article-journal", - "MPCT": "motion_picture", - "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset - "BOOK": "book", - "DATA": "dataset", - "COMP": "software", - }, - "schemaOrg": { - "Dataset": "dataset", - "Book": "book", - "ScholarlyArticle": "article-journal", - "ImageObject": "graphic", - "Collection": None, - "MediaObject": None, - "Event": None, - "SoftwareSourceCode": "software", - "Chapter": "chapter", - "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - "PublicationIssue": "article", - "AudioObject": None, - "Thesis": "thesis", - }, - "citeproc": { - "article": "article", - "article-journal": "article-journal", - "article-magazine": "article-magazine", - "article-newspaper": "article-newspaper", - "bill": "bill", - "book": "book", - "broadcast": "broadcast", - "chapter": "chapter", - "dataset": "dataset", - "entry-dictionary": "entry-dictionary", - "entry-encyclopedia": "entry-encyclopedia", - "entry": "entry", - "figure": "figure", - "graphic": "graphic", - "interview": "interview", - "legal_case": "legal_case", - "legislation": "legislation", - "manuscript": "manuscript", - "map": "map", - "motion_picture": "motion_picture", - "musical_score": "musical_score", - "pamphlet": "pamphlet", - "paper-conference": "paper-conference", - "patent": "patent", - "personal_communication": "personal_communication", - "post": "post", - "post-weblog": "post-weblog", - "report": "report", - "review-book": "review-book", - "review": "review", - "song": "song", - "speech": "speech", - "thesis": "thesis", - "treaty": "treaty", - "webpage": "webpage", - }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - "bibtex": { - "phdthesis": "thesis", - "inbook": "chapter", - "misc": None, - "article": "article-journal", - "book": "book", - }, - "resourceTypeGeneral": { - "Image": "graphic", - "Dataset": "dataset", - "PhysicalObject": None, - "Collection": None, - "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" - "Sound": None, - "InteractiveResource": None, - "Event": None, - "Software": "software", - "Other": None, - "Workflow": None, - "Audiovisual": None, - }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 -} - # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. 
DATACITE_UNKNOWN_MARKERS: List[str] = [ "(:unac)", # temporarily inaccessible @@ -181,43 +86,6 @@ DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [ } ] -# TODO(martin): merge this with other maps and lookup functions, eventually. -LICENSE_SLUG_MAP: Dict[str, str] = { - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", - "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", - "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", - "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1", - "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET", - "//onlinelibrary.wiley.com/termsandconditions/": "WILEY", - "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN", - "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY", - "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE", - "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC", - "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA", - "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", - "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0", - "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3", - "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2", - "//www.karger.com/Services/SiteLicenses/": "KARGER", - "//www.springer.com/tdm/": "SPRINGER-TDM", - "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", - "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", - "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", - "//spdx.org/licenses/CC0-1.0.json": "CC-0", - "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", - "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", - "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", - "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", - "//spdx.org/licenses/MIT.json": "MIT", - "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", -} - class DataciteImporter(EntityImporter): """ @@ -248,15 +116,6 @@ class DataciteImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri), file=sys.stderr) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map", file=sys.stderr) - self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file @@ -264,42 +123,6 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - """ - Return dictionary of identifiers referring to the same things as the given DOI. - """ - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. @@ -368,7 +191,7 @@ class DataciteImporter(EntityImporter): print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False - title = clean(title) + title = clean_str(title) if not title: print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False @@ -387,7 +210,7 @@ class DataciteImporter(EntityImporter): if not subtitle: subtitle = None else: - subtitle = clean(subtitle) + subtitle = clean_str(subtitle) # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in @@ -445,15 +268,15 @@ class DataciteImporter(EntityImporter): publisher = None if publisher: - publisher = clean(publisher) + publisher = clean_str(publisher) # Container. For the moment, only ISSN as container. container_id = None container_name = None container = attributes.get("container", {}) or {} - if container.get("type") in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("type") in DATACITE_CONTAINER_TYPE_MAP.keys(): + container_type = DATACITE_CONTAINER_TYPE_MAP.get(container["type"]) if container.get("identifier") and container.get("identifierType") == "ISSN": issn = container.get("identifier") if issn and len(issn) == 8: @@ -506,10 +329,10 @@ class DataciteImporter(EntityImporter): issue = container.get("issue") if volume: - volume = clean(volume) + volume = clean_str(volume) if issue: - issue = clean(issue) + issue = clean_str(issue) # Pages. pages = None @@ -534,7 +357,7 @@ class DataciteImporter(EntityImporter): license_extra = [] for lic in attributes.get("rightsList", []): - slug = lookup_license_slug(lic.get("rightsUri")) + slug = datacite_lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -594,7 +417,7 @@ class DataciteImporter(EntityImporter): "[{}] language detection failed with {} on {}".format(doi, err, text), file=sys.stderr, ) - abstract_text = clean(text) + abstract_text = clean_str(text) if not abstract_text: continue abstracts.append( @@ -643,7 +466,13 @@ class DataciteImporter(EntityImporter): if license_extra: extra_datacite["license"] = license_extra if attributes.get("subjects"): - extra_datacite["subjects"] = attributes["subjects"] + # these subjects with schemeUri are too much metadata, which + # doesn't compress. filter them out. + extra_subjects = [ + subj for subj in attributes["subjects"] if not subj.get("schemeUri") + ] + if extra_subjects: + extra_datacite["subjects"] = extra_subjects # Include version information. metadata_version = attributes.get("metadataVersion") or "" @@ -706,8 +535,6 @@ class DataciteImporter(EntityImporter): if release_month: extra["release_month"] = release_month - extids = self.lookup_ext_ids(doi=doi) - # Assemble release. 
re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -722,12 +549,6 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, @@ -922,14 +743,14 @@ class DataciteImporter(EntityImporter): if len(affiliations) == 0: raw_affiliation = None else: - raw_affiliation = clean(affiliations[0]) + raw_affiliation = clean_str(affiliations[0]) name = c.get("name") given_name = c.get("givenName") surname = c.get("familyName") if name: - name = clean(name) + name = clean_str(name) if not any((name, given_name, surname)): continue if not name: @@ -943,8 +764,8 @@ class DataciteImporter(EntityImporter): name = index_form_to_display_name(name) if given_name: - given_name = clean(given_name) - surname = clean(surname) + given_name = clean_str(given_name) + surname = clean_str(surname) # Perform a final assertion that name does not reduce to zero # (e.g. whitespace only name). @@ -1016,7 +837,7 @@ def contributor_list_contains_contributor( return False -def lookup_license_slug(raw: Optional[str]) -> Optional[str]: +def datacite_lookup_license_slug(raw: Optional[str]) -> Optional[str]: """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. @@ -1111,12 +932,8 @@ def lookup_license_slug(raw: Optional[str]) -> Optional[str]: return None return "RS-{}".format(name.upper()) - # Fallback to mapped values. - raw = raw.lower() - raw = raw.strip().replace("http://", "//").replace("https://", "//") - if not raw.endswith("/"): - raw = raw + "/" - return LICENSE_SLUG_MAP.get(raw) + # Fallback to generic license lookup + return lookup_license_slug(raw) def find_original_language_title( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index f5c886a2..92dbe574 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.importers.common import MAX_ABSTRACT_LENGTH, EntityImporter from fatcat_tools.normal import ( clean_doi, clean_orcid, @@ -24,9 +24,6 @@ from fatcat_tools.normal import ( parse_month, ) -# Cutoff length for abstracts. 
-MAX_ABSTRACT_LENGTH = 2048 - class DoajArticleImporter(EntityImporter): def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index e36e1b48..3c85132c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity -from .common import EntityImporter, clean, make_rel_url +from fatcat_tools.normal import clean_doi, clean_str -MAX_ABSTRACT_BYTES = 4096 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url class GrobidMetadataImporter(EntityImporter): @@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter): extra_grobid: Dict[str, Any] = dict() abstract = obj.get("abstract") - if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: + if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", content=clean(obj.get("abstract")) + mimetype="text/plain", content=clean_str(obj.get("abstract")) ) abstracts = [abobj] else: @@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter): contribs.append( fatcat_openapi_client.ReleaseContrib( index=i, - raw_name=clean(a["name"]), - given_name=clean(a.get("given_name")), - surname=clean(a.get("surname")), + raw_name=clean_str(a["name"]), + given_name=clean_str(a.get("given_name")), + surname=clean_str(a.get("surname")), role="author", extra=None, ) @@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter): pass for key in ("volume", "url", "issue", "publisher"): if raw.get(key): - cite_extra[key] = clean(raw[key]) + cite_extra[key] = clean_str(raw[key]) if raw.get("authors"): - cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] + cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]] refs.append( fatcat_openapi_client.ReleaseRef( - key=clean(raw.get("id")), + key=clean_str(raw.get("id")), year=year, - title=clean(raw["title"]), + title=clean_str(raw["title"]), extra=cite_extra or None, ) ) @@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter): # only returns year, ever? release_year = int(obj["date"][:4]) - extra = dict() - if obj.get("doi"): - extra["doi"] = obj["doi"] + extra: Dict[str, Any] = dict() + doi = clean_doi(obj.get("doi")) + if doi: + extra["doi"] = doi if obj["journal"] and obj["journal"].get("name"): - extra["container_name"] = clean(obj["journal"]["name"]) + extra["container_name"] = clean_str(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? 
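For context on the `clean_doi` change in the hunk above: the GROBID importer now validates the raw DOI string before stashing it in release `extra`, instead of copying it through verbatim. A minimal sketch of that pattern follows; the `obj` value is a hypothetical GROBID-extracted record (the expected output matches the `clean_doi` test case added elsewhere in this diff):

    from typing import Any, Dict

    from fatcat_tools.normal import clean_doi

    # hypothetical GROBID-extracted metadata; raw DOI still has a URL prefix
    obj: Dict[str, Any] = {"doi": "http://doi.org/10.1234/asdf "}

    extra: Dict[str, Any] = dict()
    doi = clean_doi(obj.get("doi"))
    if doi:
        # only a normalized, plausible DOI ends up in 'extra'
        extra["doi"] = doi

    assert extra == {"doi": "10.1234/asdf"}
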
@@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter): if self.longtail_oa: extra["longtail_oa"] = True - clean_title = clean(obj["title"], force_xml=True) + clean_title = clean_str(obj["title"], force_xml=True) if not clean_title or len(clean_title) < 2: return None title = clean_title @@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj["journal"].get("publisher")), - volume=clean(obj["journal"].get("volume")), - issue=clean(obj["journal"].get("issue")), + publisher=clean_str(obj["journal"].get("publisher")), + volume=clean_str(obj["journal"].get("volume")), + issue=clean_str(obj["journal"].get("issue")), abstracts=abstracts or None, ext_ids=fatcat_openapi_client.ReleaseExtIds(), extra=extra or None, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..9916a55f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence @@ -7,9 +6,9 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str, is_cjk -from .common import DATE_FMT, EntityImporter, clean, is_cjk +from .common import DATE_FMT, EntityImporter # TODO: should be List[Tag] not List[Any] for full type annotations @@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: for raw in raw_persons: name = raw.find("name") or None if name: - name = clean(name.get_text().replace("\n", " ")) + name = clean_str(name.get_text().replace("\n", " ")) surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace("\n", " ")) + given_name = clean_str(given_name.get_text().replace("\n", " ")) lang = "en" if is_cjk(name): lang = "ja" @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, raw_record: Any) -> bool: return True @@ -273,16 +230,16 @@ class JalcImporter(EntityImporter): for p in record.find_all("publicationName") if p.get_text() ] - pubs = [clean(p) for p in pubs if p] + pubs = [clean_str(p) for p in pubs if p] assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] - container_name = clean(pubs[0]) + container_name = clean_str(pubs[0]) if len(pubs) > 1: - container_extra["original_name"] = clean(pubs[1]) + container_extra["original_name"] = clean_str(pubs[1]) if record.publisher: pubs = [ @@ -297,7 +254,7 @@ class JalcImporter(EntityImporter): # ordering is not reliable pubs = [pubs[1], pubs[0]] if pubs: - publisher = clean(pubs[0]) + publisher = clean_str(pubs[0]) if len(pubs) > 1: container_extra["publisher_aliases"] = pubs[1:] @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter): # reasonable default for this collection release_type = "article-journal" - # external identifiers - extids = self.lookup_ext_ids(doi=doi) - # extra: # translation_of # aliases @@ -342,26 +296,20 @@ class JalcImporter(EntityImporter): # (informally) extra["jalc"] = extra_jalc - title = clean(title) + title = clean_str(title) if not title: return None re = ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage="published", release_date=release_date, release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=volume, issue=issue, diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index a45e49f3..fc1dfcbd 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -3,7 +3,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ContainerEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter def or_none(s: Optional[str]) -> Optional[str]: @@ -105,7 +107,7 @@ class JournalMetadataImporter(EntityImporter): if extra_ia: extra["ia"] = extra_ia - name = clean(row.get("name")) + name = clean_str(row.get("name")) if not name: return None @@ -115,7 +117,7 @@ class JournalMetadataImporter(EntityImporter): issnp=row.get("issnp"), container_type=None, # TODO name=name, - publisher=clean(row.get("publisher")), + publisher=clean_str(row.get("publisher")), wikidata_qid=None, # TODO extra=extra, ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 2c8aa0a4..79691c9a 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,7 +8,10 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from 
.common import LANG_MAP_MARC, EntityImporter, clean +from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC +from fatcat_tools.normal import clean_doi, clean_str + +from .common import EntityImporter from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? @@ -138,7 +141,7 @@ class JstorImporter(EntityImporter): issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True), + name=clean_str(journal_title, force_xml=True), ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -146,7 +149,9 @@ class JstorImporter(EntityImporter): doi = article_meta.find("article-id", {"pub-id-type": "doi"}) if doi: - doi = doi.string.lower().strip() + doi = clean_doi(doi.string.lower()) + else: + doi = None jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) if jstor_id: @@ -162,13 +167,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace("\n", " ")) + given = clean_str(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace("\n", " ")) + raw_name = clean_str(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -230,7 +235,7 @@ class JstorImporter(EntityImporter): # JSTOR issue-id if article_meta.find("issue-id"): - issue_id = clean(article_meta.find("issue-id").string) + issue_id = clean_str(article_meta.find("issue-id").string) if issue_id: extra_jstor["issue_id"] = issue_id diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 430cdd0f..f3d82a86 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -4,7 +4,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, CreatorEntity -from .common import EntityImporter, clean +from fatcat_tools.normal import clean_str + +from .common import EntityImporter def value_or_none(e: Any) -> Any: @@ -65,14 +67,14 @@ class OrcidImporter(EntityImporter): if not self.is_orcid(orcid): sys.stderr.write("Bad ORCID: {}\n".format(orcid)) return None - display = clean(display) + display = clean_str(display) if not display: # must have *some* name return None ce = CreatorEntity( orcid=orcid, - given_name=clean(given), - surname=clean(sur), + given_name=clean_str(given), + surname=clean_str(sur), display_name=display, extra=extra, ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 1cdb450b..a6c7409d 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -8,317 +8,15 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid - -from .common import LANG_MAP_MARC, EntityImporter, clean - -# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly -PUBMED_RELEASE_TYPE_MAP = { - # Adaptive Clinical Trial - "Address": "speech", - "Autobiography": "book", - # Bibliography - "Biography": "book", - # Case Reports - "Classical Article": "article-journal", - # Clinical Conference 
- # Clinical Study - # Clinical Trial - # Clinical Trial, Phase I - # Clinical Trial, Phase II - # Clinical Trial, Phase III - # Clinical Trial, Phase IV - # Clinical Trial Protocol - # Clinical Trial, Veterinary - # Collected Works - # Comparative Study - # Congress - # Consensus Development Conference - # Consensus Development Conference, NIH - # Controlled Clinical Trial - "Dataset": "dataset", - # Dictionary - # Directory - # Duplicate Publication - "Editorial": "editorial", - # English Abstract # doesn't indicate that this is abstract-only - # Equivalence Trial - # Evaluation Studies - # Expression of Concern - # Festschrift - # Government Document - # Guideline - "Historical Article": "article-journal", - # Interactive Tutorial - "Interview": "interview", - "Introductory Journal Article": "article-journal", - "Journal Article": "article-journal", - "Lecture": "speech", - "Legal Case": "legal_case", - "Legislation": "legislation", - "Letter": "letter", - # Meta-Analysis - # Multicenter Study - # News - "Newspaper Article": "article-newspaper", - # Observational Study - # Observational Study, Veterinary - # Overall - # Patient Education Handout - # Periodical Index - # Personal Narrative - # Portrait - # Practice Guideline - # Pragmatic Clinical Trial - # Publication Components - # Publication Formats - # Publication Type Category - # Randomized Controlled Trial - # Research Support, American Recovery and Reinvestment Act - # Research Support, N.I.H., Extramural - # Research Support, N.I.H., Intramural - # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - # Research Support, U.S. Gov't, P.H.S. - # Review # in the "literature review" sense, not "product review" - # Scientific Integrity Review - # Study Characteristics - # Support of Research - # Systematic Review - "Technical Report": "report", - # Twin Study - # Validation Studies - # Video-Audio Media - # Webcasts -} - -MONTH_ABBR_MAP = { - "Jan": 1, - "01": 1, - "Feb": 2, - "02": 2, - "Mar": 3, - "03": 3, - "Apr": 4, - "04": 4, - "May": 5, - "05": 5, - "Jun": 6, - "06": 6, - "Jul": 7, - "07": 7, - "Aug": 8, - "08": 8, - "Sep": 9, - "09": 9, - "Oct": 10, - "10": 10, - "Nov": 11, - "11": 11, - "Dec": 12, - "12": 12, -} - -# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ -COUNTRY_NAME_MAP = { - "Afghanistan": "af", - "Albania": "al", - "Algeria": "dz", - "Andorra": "ad", - "Angola": "ao", - "Antigua and Barbuda": "ag", - "Argentina": "ar", - "Armenia": "am", - "Australia": "au", - "Austria": "at", - "Azerbaijan": "az", - "Bahamas": "bs", - "Bahrain": "bh", - "Bangladesh": "bd", - "Barbados": "bb", - "Belarus": "by", - "Belgium": "be", - "Belize": "bz", - "Benin": "bj", - "Bhutan": "bt", - "Bolivia": "bo", - "Bosnia and Herzegowina": "ba", - "Botswana": "bw", - "Brazil": "br", - "Brunei Darussalam": "bn", - "Bulgaria": "bg", - "Burkina Faso": "bf", - "Burundi": "bi", - "Cambodia": "kh", - "Cameroon": "cm", - "Canada": "ca", - "Cape Verde": "cv", - "Central African Republic": "cf", - "Chad": "td", - "Chile": "cl", - "China": "cn", - "Colombia": "co", - "Comoros": "km", - "Congo, Democratic Republic": "cd", - "Congo, People’s Republic": "cg", - "Costa Rica": "cr", - "Cote d'Ivoire": "ci", - "Croatia (Local Name: Hrvatska)": "hr", - "Cuba": "cu", - "Cyprus": "cy", - "Czech Republic": "cz", - "Denmark": "dk", - "Djibouti": "dj", - "Dominica": "dm", - "Dominican Republic": "do", - "East Timor": "tl", - "Ecuador": "ec", - "El Salvador": "sv", - "Equatorial Guinea": "gq", - "Eritrea": "er", - "Estonia": "ee", - 
"Ethiopia": "et", - "Fiji": "fj", - "Finland": "fi", - "France": "fr", - "Gabon": "ga", - "Gambia": "gm", - "Georgia": "ge", - "Germany": "de", - "Ghana": "gh", - "Greece": "gr", - "Greenland": "gl", - "Grenada": "gd", - "Guatemala": "gt", - "Guinea": "gn", - "Guinea-Bissau": "gw", - "Guyana": "gy", - "Haiti": "ht", - "Honduras": "hn", - "Hong Kong": "hk", - "Hungary": "hu", - "Iceland": "is", - "India": "in", - "Indonesia": "id", - "Iran": "ir", - "Iraq": "iq", - "Ireland": "ie", - "Israel": "il", - "Italy": "it", - "Jamaica": "jm", - "Japan": "jp", - "Jordan": "jo", - "Kazakhstan": "kz", - "Kenya": "ke", - "Kiribati": "ki", - "Korea, Democratic People's Republic": "kp", - "Korea, Republic": "kr", - "Kuwait": "kw", - "Kyrgyzstan": "kg", - "Laos": "la", - "Latvia": "lv", - "Lebanon": "lb", - "Lesotho": "ls", - "Liberia": "lr", - "Libya": "ly", - "Liechtenstein": "li", - "Lithuania": "lt", - "Luxembourg": "lu", - "Macedonia": "mk", - "Madagascar": "mg", - "Malawi": "mw", - "Malaysia": "my", - "Maldives": "mv", - "Mali": "ml", - "Malta": "mt", - "Marshall Islands": "mh", - "Mauritania": "mr", - "Mauritius": "mu", - "Mexico": "mx", - "Micronesia": "fm", - "Moldova": "md", - "Monaco": "mc", - "Mongolia": "mn", - "Morocco": "ma", - "Mozambique": "mz", - "Myanmar": "mm", - "Namibia": "na", - "Nauru": "nr", - "Nepal": "np", - "Netherlands": "nl", - "New Zealand": "nz", - "Nicaragua": "ni", - "Niger": "ne", - "Nigeria": "ng", - "Norway": "no", - "Oman": "om", - "Pakistan": "pk", - "Palau": "pw", - "Panama": "pa", - "Papua New Guinea": "pg", - "Paraguay": "py", - "Peru": "pe", - "Philippines": "ph", - "Poland": "pl", - "Portugal": "pt", - "Puerto Rico": "pr", - "Qatar": "qa", - "Romania": "ro", - "Russian Federation": "ru", - "Rwanda": "rw", - "Saint Kitts and Nevis": "kn", - "Saint Lucia": "lc", - "Saint Vincent and the Grenadines": "vc", - "Samoa": "ws", - "San Marino": "sm", - "Sao Tome and Príncipe": "st", - "Saudi Arabia": "sa", - "Senegal": "sn", - "Serbia and Montenegro": "cs", - "Seychelles": "sc", - "Sierra Leone": "sl", - "Singapore": "sg", - "Slovakia (Slovak Republic)": "sk", - "Slovenia": "si", - "Solomon Islands": "sb", - "Somalia": "so", - "South Africa": "za", - "Spain": "es", - "Sri Lanka": "lk", - "Sudan": "sd", - "Suriname": "sr", - "Swaziland": "sz", - "Sweden": "se", - "Switzerland": "ch", - "Syrian Arab Republic": "sy", - "Taiwan": "tw", - "Tajikistan": "tj", - "Tanzania": "tz", - "Tanzania": "tz", - "Thailand": "th", - "Togo": "tg", - "Tonga": "to", - "Trinidad and Tobago": "tt", - "Tunisia": "tn", - "Turkey": "tr", - "Turkmenistan": "tm", - "Tuvalu": "tv", - "Uganda": "ug", - "Ukraine": "ua", - "United Arab Emirates": "ae", - "United Kingdom": "gb", - "United States": "us", - "Uruguay": "uy", - # Additions from running over large files - "Bosnia and Herzegovina": "ba", - # "International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn - "Russia (Federation)": "ru", - "Scotland": "gb", - "England": "gb", - "Korea (South)": "kr", - "Georgia (Republic)": "ge", - "Egypt": "eg", -} +from fatcat_tools.biblio_lookup_tables import ( + COUNTRY_NAME_MAP, + LANG_MAP_MARC, + MONTH_ABBR_MAP, + PUBMED_RELEASE_TYPE_MAP, +) +from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid, clean_str + +from .common import EntityImporter class PubmedImporter(EntityImporter): @@ -704,14 +402,14 @@ class PubmedImporter(EntityImporter): if extra_pubmed: extra["pubmed"] = extra_pubmed - title = clean(title) + title = clean_str(title) if not title: return None 
re = fatcat_openapi_client.ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage=release_stage, release_date=release_date, diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py deleted file mode 100755 index 5caed2c7..00000000 --- a/python/fatcat_tools/importers/wayback_static.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 - -""" -Helpers to create Web Capture entities from extracted wayback content. - -Works as a stand-alone script (for debugging) or as library routines. -""" - -import argparse -import datetime -import hashlib -import json -import subprocess -import sys -from typing import Any, Dict, List, Optional, Tuple - -import requests -from bs4 import BeautifulSoup -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - EntityEdit, - WebcaptureCdxLine, - WebcaptureEntity, - WebcaptureUrl, -) - -from .common import b32_hex - -CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" -GWB_URL_BASE = "https://web.archive.org/web" -REQ_SESSION = requests.Session() - - -def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: - """Takes a wayback machine URL, and returns a tuple: - - (timestamp, datetime, original_url) - """ - chunks = url.split("/") - assert len(chunks) >= 6 - assert chunks[2] == "web.archive.org" - assert chunks[3] == "web" - return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) - - -def test_parse_wbm_url() -> None: - u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" - assert parse_wbm_url(u) == ( - "20010712114837", - datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html", - ) - - -def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: - """ - Takes a complete WBM timestamp string (like "20020327115625") and returns a - python datetime object (UTC) - """ - # strip any "im_" or "id_" suffix - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - # inflexible; require the full second-precision timestamp - assert len(timestamp) == 14 - return datetime.datetime( - year=int(timestamp[0:4]), - month=int(timestamp[4:6]), - day=int(timestamp[6:8]), - hour=int(timestamp[8:10]), - minute=int(timestamp[10:12]), - second=int(timestamp[12:14]), - ) - - -def test_parse_wbm_timestamp() -> None: - assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) - - -def fetch_wbm(url: str) -> bytes: - resp = REQ_SESSION.get(url) - resp.raise_for_status() - assert resp.content - return resp.content - - -def lookup_cdx( - embed_url: str, verify_hashes: bool = True, cdx_output: Any = None -) -> Optional[WebcaptureCdxLine]: - sys.stderr.write(embed_url + "\n") - assert embed_url.startswith("/web/") - embed_url_segments = embed_url.split("/") - timestamp = embed_url_segments[2] - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - url = "/".join(embed_url_segments[3:]) - # print((timestamp, url)) - params: Dict = dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - ) - resp = REQ_SESSION.get( - CDX_API_BASE, - params=params, - ) - resp.raise_for_status() - # print(resp.url) - if resp.content: - hit = resp.content.decode("utf-8").split("\n")[0] - if cdx_output: - cdx_output.write(hit + "\n") - cdx_chunks = hit.split(" ") - cdx = [x if (x and x != "-") else None for x in 
cdx_chunks] - webcapture_cdx = WebcaptureCdxLine( - surt=cdx[0], - timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z", - url=cdx[2], - mimetype=cdx[3], - status_code=int(cdx[4] or ""), - sha1=b32_hex(cdx[5] or ""), - sha256=None, - ) - if verify_hashes: - resp = REQ_SESSION.get( - GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp - ) - resp.raise_for_status() - assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() - webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() - webcapture_cdx.size = len(resp.content) - return webcapture_cdx - else: - return None - - -def wayback_url_to_relative(url: str) -> Optional[str]: - """ - Wayback URLs can be relative or absolute in rewritten documents. This - function converts any form of rewritten URL to a relative (to - web.archive.org) one, or returns None if it isn't a rewritten URL at all. - """ - if url.startswith("https://web.archive.org/"): - url = url[23:] - elif url.startswith("http://web.archive.org/"): - url = url[22:] - - if url.startswith("/web/"): - return url - else: - return None - - -def extract_embeds(soup: BeautifulSoup) -> List[str]: - - embeds = set() - - # <link href=""> - for tag in soup.find_all("link", href=True): - if tag["rel"] not in ("stylesheet",): - continue - url = wayback_url_to_relative(tag["href"]) - if url: - embeds.add(url) - # <img src=""> - for tag in soup.find_all("img", src=True): - url = wayback_url_to_relative(tag["src"]) - if url: - embeds.add(url) - - # <script src=""> - for tag in soup.find_all("script", src=True): - url = wayback_url_to_relative(tag["src"]) - if url: - embeds.add(url) - - return list(embeds) - - -def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity: - """ - Given a complete wayback machine capture URL, like: - - http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html - - Will return a new ("bare") fatcat webcapture entity python object, with all - the CDX entries filled in. - """ - - wbm_html = fetch_wbm(wayback_url) - raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - # with open(rewritten_path, 'r') as fp: - # soup = BeautifulSoup(fp, "lxml") - soup = BeautifulSoup(wbm_html, "lxml") - embeds = extract_embeds(soup) - cdx_obj = lookup_cdx( - "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output - ) - cdx_list = [cdx_obj] - for url in embeds: - cdx_obj = lookup_cdx(url, cdx_output=cdx_output) - cdx_list.append(cdx_obj) - archive_urls = [ - WebcaptureUrl( - rel="wayback", - url="https://web.archive.org/web/", - ) - ] - wc = WebcaptureEntity( - cdx=cdx_list, - timestamp=timestamp.isoformat() + "Z", - original_url=original_url, - archive_urls=archive_urls, - release_ids=None, - ) - return wc - - -def auto_wayback_static( - api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None -) -> Tuple[Optional[str], Optional[EntityEdit]]: - """ - Returns a tuple: (editgroup_id, edit). 
If failed, both are None - """ - - raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - - release = api.get_release(release_id, expand="webcaptures") - - # check for existing webcapture with same parameters - for wc in release.webcaptures: - if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): - # skipping: already existed - print( - "release {} already had webcapture {} {}".format( - release_id, raw_timestamp, original_url - ) - ) - return (None, None) - - wc = static_wayback_webcapture(wayback_url) - assert len(wc.cdx) >= 1 - wc.release_ids = [release_id] - if not editgroup_id: - eg = api.create_editgroup( - Editgroup( - description="One-off import of static web content from wayback machine", - extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), - ) - ) - editgroup_id = eg.editgroup_id - edit = api.create_webcapture(eg.editgroup_id, wc) - return (editgroup_id, edit) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--verbose", action="store_true", help="verbose output") - parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") - parser.add_argument( - "--json-output", - type=argparse.FileType("w"), - default=sys.stdout, - help="where to write out webcapture entity (as JSON)", - ) - parser.add_argument( - "--cdx-output", - type=argparse.FileType("w"), - default=None, - help="(optional) file to write out CDX stub", - ) - - args = parser.parse_args() - - # entity-to-JSON code; duplicate of entity_to_dict() - api_client = ApiClient() - wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output) - wc_dict = api_client.sanitize_for_serialization(wc) - print(json.dumps(wc_dict)) - - -if __name__ == "__main__": - main() diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 34e5c3d1..dd0a4f74 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -12,6 +12,8 @@ import ftfy import langdetect import pycountry +from .biblio_lookup_tables import LICENSE_SLUG_MAP + DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") @@ -47,7 +49,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: raw = raw[8:] if raw.startswith("dx.doi.org/"): raw = raw[11:] - if raw[7:9] == "//": + if raw[7:9] == "//" and "10.1037//" in raw: raw = raw[:8] + raw[9:] # fatcatd uses same REGEX, but Rust regex rejects these characters, while @@ -74,6 +76,7 @@ def test_clean_doi() -> None: assert clean_doi("10.1234/asdf ") == "10.1234/asdf" assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" + assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86" assert clean_doi("10.23750/abm.v88i2 -s.6506") is None assert clean_doi("10.17167/mksz.2017.2.129–155") is None assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf" @@ -605,84 +608,38 @@ def test_parse_country_name() -> None: assert parse_country_name("Japan") == "jp" -# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of -# 2/T and 2/B? 
-# PubMed/MEDLINE and JSTOR use these MARC codes -# https://www.loc.gov/marc/languages/language_name.html -LANG_MAP_MARC = { - "afr": "af", - "alb": "sq", - "amh": "am", - "ara": "ar", - "arm": "hy", - "aze": "az", - "ben": "bn", - "bos": "bs", - "bul": "bg", - "cat": "ca", - "chi": "zh", - "cze": "cs", - "dan": "da", - "dut": "nl", - "eng": "en", - "epo": "eo", - "est": "et", - "fin": "fi", - "fre": "fr", - "geo": "ka", - "ger": "de", - "gla": "gd", - "gre": "el", - "heb": "he", - "hin": "hi", - "hrv": "hr", - "hun": "hu", - "ice": "is", - "ind": "id", - "ita": "it", - "jpn": "ja", - "kin": "rw", - "kor": "ko", - "lat": "la", - "lav": "lv", - "lit": "lt", - "mac": "mk", - "mal": "ml", - "mao": "mi", - "may": "ms", - "nor": "no", - "per": "fa", - "per": "fa", - "pol": "pl", - "por": "pt", - "pus": "ps", - "rum": "ro", - "rus": "ru", - "san": "sa", - "slo": "sk", - "slv": "sl", - "spa": "es", - "srp": "sr", - "swe": "sv", - "tha": "th", - "tur": "tr", - "ukr": "uk", - "urd": "ur", - "vie": "vi", - "wel": "cy", - # additions - "gle": "ga", # "Irish" (Gaelic) - "jav": "jv", # Javanese - "welsh": "cy", # Welsh - "oci": "oc", # Occitan - # Don't have ISO 639-1 codes - "grc": "el", # Ancient Greek; map to modern greek - "map": None, # Austronesian (collection) - "syr": None, # Syriac, Modern - "gem": None, # Old Saxon - "non": None, # Old Norse - "emg": None, # Eastern Meohang - "neg": None, # Negidal - "mul": None, # Multiple languages - "und": None, # Undetermined -} +def lookup_license_slug(raw: Optional[str]) -> Optional[str]: + if not raw: + return None + # normalize to lower-case and not ending with a slash + raw = raw.strip().lower() + if raw.endswith("/"): + raw = raw[:-1] + # remove http/https prefix + raw = raw.replace("http://", "//").replace("https://", "//") + # special-case normalization of CC licenses + if "creativecommons.org" in raw: + raw = raw.replace("/legalcode", "").replace("/uk", "") + return LICENSE_SLUG_MAP.get(raw) + + +def test_lookup_license_slug() -> None: + + assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" + assert ( + lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") + == "CC-BY" + ) + assert ( + lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") + == "CC-0" + ) + assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" + assert ( + lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") + == "CC-BY-NC-SA" + ) + assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" + assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None + assert lookup_license_slug("") is None + assert lookup_license_slug(None) is None diff --git a/python/tests/files/datacite/datacite_result_00.json b/python/tests/files/datacite/datacite_result_00.json index 130a46ff..06147cfc 100644 --- a/python/tests/files/datacite/datacite_result_00.json +++ b/python/tests/files/datacite/datacite_result_00.json @@ -87,6 +87,5 @@ "release_type": "article-journal", "release_year": 2019, "title": "Synthesis and Crystal Structure of a Compound with Two Conformational Isomers: N-(2-methylbenzoyl)-N′-(4-nitrophenyl)thiourea", - "volume": "38", - "license_slug": "SPRINGER-TDM" + "volume": "38" } diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index eb931eb1..5f38e73e 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -10,17 +10,13 @@ from 
fatcat_tools.importers import CrossrefImporter, JsonLinePusher @pytest.fixture(scope="function") def crossref_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=False) @pytest.mark.skip( diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 220dc0f6..28884cda 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -15,9 +15,9 @@ from fatcat_tools.importers import DataciteImporter, JsonLinePusher from fatcat_tools.importers.datacite import ( clean_doi, contributor_list_contains_contributor, + datacite_lookup_license_slug, find_original_language_title, index_form_to_display_name, - lookup_license_slug, parse_datacite_dates, parse_datacite_titles, ) @@ -30,7 +30,6 @@ def datacite_importer(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, ) @@ -41,7 +40,6 @@ def datacite_importer_existing(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, ) @@ -465,9 +463,9 @@ def test_lookup_license_slug(): Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"), Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"), Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"), - Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"), + Case("http://doi.wiley.com/10.1002/tdm_license_1.1", None), Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"), - Case("http://www.springer.com/tdm", "SPRINGER-TDM"), + Case("http://www.springer.com/tdm", None), Case( "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml", "ADS-UK", @@ -479,11 +477,11 @@ def test_lookup_license_slug(): Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"), Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"), Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"), - Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"), + Case("http://onlinelibrary.wiley.com/termsAndConditions", None), Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"), Case( "http://journals.sagepub.com/page/policies/text-and-data-mining-license", - "SAGE-TDM", + None, ), Case( "https://creativecommons.org/publicdomain/mark/1.0/deed.de", @@ -508,7 +506,7 @@ def test_lookup_license_slug(): ] for c in cases: - got = lookup_license_slug(c.input) + got = datacite_lookup_license_slug(c.input) assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py index 4ebc87b4..8281b9a1 100644 --- a/python/tests/import_jalc.py +++ b/python/tests/import_jalc.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo @pytest.fixture(scope="function") def jalc_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, 
extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JalcImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jalc_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JalcImporter(api, issn_file, bezerk_mode=False) def test_jalc_importer(jalc_importer): diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py index 8ad550b3..7e13c8b0 100644 --- a/python/tests/import_jstor.py +++ b/python/tests/import_jstor.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter @pytest.fixture(scope="function") def jstor_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JstorImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jstor_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JstorImporter(api, issn_file, bezerk_mode=False) def test_jstor_importer(jstor_importer): diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index a5301f29..e783db48 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -11,7 +11,6 @@ def pubmed_importer(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, lookup_refs=True, ) @@ -23,7 +22,6 @@ def pubmed_importer_existing(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, lookup_refs=True, ) |
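As a closing note on the two behavioral changes in `fatcat_tools/normal.py` above (double-slash DOIs are now preserved except for the `10.1037//` special case, and license URL resolution goes through the new generic `lookup_license_slug`), here is a small usage sketch; the expected values are taken directly from the test cases added in this diff:

    from fatcat_tools.normal import clean_doi, lookup_license_slug

    # double-slash DOIs are now kept as-is...
    assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86"
    # ...except the known-mangled 10.1037// prefix, which is still collapsed
    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"

    # license URLs are normalized (scheme and trailing slash stripped, CC
    # "/legalcode" and "/uk" suffixes dropped) and then looked up in the
    # shared LICENSE_SLUG_MAP from biblio_lookup_tables
    assert lookup_license_slug(
        "http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY"
    assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
    assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None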