diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 158 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 358 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 329 | ||||
-rw-r--r-- | python/fatcat_tools/normal.py | 333 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 318 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 12 | ||||
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 28 |
10 files changed, 1198 insertions, 353 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index b82eb11a..d2928d09 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -27,6 +27,7 @@ from .orcid import OrcidImporter from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat -from .ingest import IngestFileResultImporter, SavePaperNowFileImporter +from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter from .shadow import ShadowLibraryImporter from .file_meta import FileMetaImporter +from .doaj_article import DoajArticleImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 14415683..3c810391 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,12 +3,9 @@ import re import sys import csv import json -import ftfy -import base64 import sqlite3 import datetime import subprocess -import unicodedata from collections import Counter from confluent_kafka import Consumer, KafkaException import xml.etree.ElementTree as ET @@ -18,162 +15,13 @@ from bs4 import BeautifulSoup import fatcat_openapi_client from fatcat_openapi_client.rest import ApiException +# TODO: refactor so remove need for this (re-imports for backwards compatibility) +from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 SANE_MAX_URLS = 100 -# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of -# 2/T and 2/B? -# PubMed/MEDLINE and JSTOR use these MARC codes -# https://www.loc.gov/marc/languages/language_name.html -LANG_MAP_MARC = { - 'afr': 'af', - 'alb': 'sq', - 'amh': 'am', - 'ara': 'ar', - 'arm': 'hy', - 'aze': 'az', - 'ben': 'bn', - 'bos': 'bs', - 'bul': 'bg', - 'cat': 'ca', - 'chi': 'zh', - 'cze': 'cs', - 'dan': 'da', - 'dut': 'nl', - 'eng': 'en', - 'epo': 'eo', - 'est': 'et', - 'fin': 'fi', - 'fre': 'fr', - 'geo': 'ka', - 'ger': 'de', - 'gla': 'gd', - 'gre': 'el', - 'heb': 'he', - 'hin': 'hi', - 'hrv': 'hr', - 'hun': 'hu', - 'ice': 'is', - 'ind': 'id', - 'ita': 'it', - 'jpn': 'ja', - 'kin': 'rw', - 'kor': 'ko', - 'lat': 'la', - 'lav': 'lv', - 'lit': 'lt', - 'mac': 'mk', - 'mal': 'ml', - 'mao': 'mi', - 'may': 'ms', - 'nor': 'no', - 'per': 'fa', - 'per': 'fa', - 'pol': 'pl', - 'por': 'pt', - 'pus': 'ps', - 'rum': 'ro', - 'rus': 'ru', - 'san': 'sa', - 'slo': 'sk', - 'slv': 'sl', - 'spa': 'es', - 'srp': 'sr', - 'swe': 'sv', - 'tha': 'th', - 'tur': 'tr', - 'ukr': 'uk', - 'urd': 'ur', - 'vie': 'vi', - 'wel': 'cy', - -# additions - 'gle': 'ga', # "Irish" (Gaelic) - 'jav': 'jv', # Javanese - 'welsh': 'cy', # Welsh - 'oci': 'oc', # Occitan - -# Don't have ISO 639-1 codes - 'grc': 'el', # Ancient Greek; map to modern greek - 'map': None, # Austronesian (collection) - 'syr': None, # Syriac, Modern - 'gem': None, # Old Saxon - 'non': None, # Old Norse - 'emg': None, # Eastern Meohang - 'neg': None, # Negidal - 'mul': None, # Multiple languages - 'und': None, # Undetermined -} - - -def clean(thing, force_xml=False): - """ - This function is appropriate to be called on any random, non-markup string, - such as author names, titles, etc. - - It will try to clean up common unicode mangles, HTML characters, etc. - - This will detect XML/HTML and "do the right thing" (aka, not remove - entities like '&' if there are tags in the string), unless you pass the - 'force_xml' parameter, which might be appropriate for, eg, names and - titles, which generally should be projected down to plain text. - - Also strips extra whitespace. - """ - if not thing: - return None - fix_entities = 'auto' - if force_xml: - fix_entities = True - fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() - if not fixed or len(fixed) <= 1: - # wasn't zero-length before, but is now; return None - return None - return fixed - -def test_clean(): - - assert clean(None) == None - assert clean('') == None - assert clean('1') == None - assert clean('123') == '123' - assert clean('a&b') == 'a&b' - assert clean('<b>a&b</b>') == '<b>a&b</b>' - assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' - -def b32_hex(s): - s = s.strip().split()[0].lower() - if s.startswith("sha1:"): - s = s[5:] - if len(s) != 32: - return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - -def is_cjk(s): - if not s: - return False - for c in s: - if c.isalpha(): - lang_prefix = unicodedata.name(c).split()[0] - return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') - return False - -def test_is_cjk(): - assert is_cjk(None) is False - assert is_cjk('') is False - assert is_cjk('blah') is False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True - assert is_cjk('菊') is True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True - assert is_cjk('水道') is True - assert is_cjk('オウ, イク') is True # kanji - assert is_cjk('ひヒ') is True - assert is_cjk('き゚ゅ') is True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True - DOMAIN_REL_MAP = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -444,6 +292,7 @@ class EntityImporter: raise NotImplementedError def is_orcid(self, orcid): + # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): @@ -464,6 +313,7 @@ class EntityImporter: return creator_id def is_doi(self, doi): + # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 def lookup_doi(self, doi): diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 71f08952..e77fa65e 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -487,8 +487,6 @@ class CrossrefImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - # doesn't exist, need to update - return True # eventually we'll want to support "updates", but for now just skip if # entity already exists diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 86740e80..70f8db86 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -151,7 +151,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( 'Unknown', ))) -# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist. +# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi @@ -346,7 +346,7 @@ class DataciteImporter(EntityImporter): print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) return False - # check for blacklisted "spam", e.g. "FULL MOVIE" + # check for blocklisted "spam", e.g. "FULL MOVIE" for rule in DATACITE_TITLE_SPAM_WORDGROUPS: seen = set() for token in rule.get("tokens", []): @@ -781,8 +781,6 @@ class DataciteImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - # doesn't exist, need to update - return True # eventually we'll want to support "updates", but for now just skip if # entity already exists @@ -819,7 +817,7 @@ class DataciteImporter(EntityImporter): contribs = [] # Names, that should be ignored right away. - name_blacklist = set(('Occdownload Gbif.Org',)) + name_blocklist = set(('Occdownload Gbif.Org',)) i = 0 for c in creators: @@ -861,7 +859,7 @@ class DataciteImporter(EntityImporter): continue if not name: name = "{} {}".format(given_name or '', surname or '').strip() - if name in name_blacklist: + if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: continue diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py new file mode 100644 index 00000000..03752484 --- /dev/null +++ b/python/fatcat_tools/importers/doaj_article.py @@ -0,0 +1,358 @@ +""" +Importer for DOAJ article-level metadata, schema v1. + +DOAJ API schema and docs: https://doaj.org/api/v1/docs +""" + +import warnings +import datetime +from typing import List, Optional + +import fatcat_openapi_client +from fatcat_tools.normal import (clean_doi, clean_str, parse_month, + clean_orcid, detect_text_lang, parse_lang_name, parse_country_name, + clean_pmid, clean_pmcid) +from fatcat_tools.importers.common import EntityImporter + +# Cutoff length for abstracts. +MAX_ABSTRACT_LENGTH = 2048 + + +class DoajArticleImporter(EntityImporter): + + def __init__(self, + api, + issn_map_file, + **kwargs): + + eg_desc = kwargs.get( + 'editgroup_description', + "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" + ) + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', + 'fatcat_tools.DoajArticleImporter') + # ensure default is to not do updates with this worker (override super() default) + kwargs['do_updates'] = kwargs.get("do_updates", False) + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.this_year = datetime.datetime.now().year + self.read_issn_map_file(issn_map_file) + + def want(self, obj): + return True + + def parse_record(self, obj): + """ + bibjson { + abstract (string, optional), + author (Array[bibjson.author], optional), + identifier (Array[bibjson.identifier]), + journal (bibjson.journal, optional), + keywords (Array[string], optional), + link (Array[bibjson.link], optional), + month (string, optional), + subject (Array[bibjson.subject], optional), + title (string), + year (string, optional) + } + bibjson.journal { + country (string, optional), + end_page (string, optional), + language (Array[string], optional), + license (Array[bibjson.journal.license], optional), + number (string, optional), + publisher (string, optional), + start_page (string, optional), + title (string, optional), + volume (string, optional) + } + """ + + if not obj or not isinstance(obj, dict) or not 'bibjson' in obj: + self.counts['skip-empty'] += 1 + return None + + bibjson = obj['bibjson'] + + title = clean_str(bibjson.get('title'), force_xml=True) + if not title: + self.counts['skip-title'] += 1 + return False + + container_name = clean_str(bibjson['journal']['title']) + container_id = None + # NOTE: 'issns' not documented in API schema + for issn in bibjson['journal']['issns']: + issnl = self.issn2issnl(issn) + if issnl: + container_id = self.lookup_issnl(self.issn2issnl(issn)) + if container_id: + # don't store container_name when we have an exact match + container_name = None + break + + volume = clean_str(bibjson['journal'].get('volume')) + # NOTE: this schema seems to use "number" as "issue number" + issue = clean_str(bibjson['journal'].get('number')) + publisher = clean_str(bibjson['journal'].get('publisher')) + + try: + release_year = int(bibjson.get('year')) + except (TypeError, ValueError): + release_year = None + release_month = parse_month(clean_str(bibjson.get('month'))) + + # block bogus far-future years/dates + if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + release_month = None + release_year = None + + license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) + country = parse_country_name(bibjson['journal'].get('country')) + language = None + for raw in bibjson['journal'].get('language') or []: + language = parse_lang_name(raw) + if language: + break + + # pages + # NOTE: error in API docs? seems like start_page not under 'journal' object + start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) + end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) + pages: Optional[str] = None + if start_page and end_page: + pages = f"{start_page}-{end_page}" + elif start_page: + pages = start_page + + doaj_article_id = obj['id'].lower() + ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) + abstracts = self.doaj_abstracts(bibjson) + contribs = self.doaj_contribs(bibjson.get('author') or []) + + # DOAJ-specific extra + doaj_extra = dict() + if bibjson.get('subject'): + doaj_extra['subject'] = bibjson.get('subject') + if bibjson.get('keywords'): + doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] + + # generic extra + extra = dict() + if country: + extra['country'] = country + if not container_id and container_name: + extra['container_name'] = container_name + if release_year and release_month: + # TODO: schema migration + extra['release_month'] = release_month + + if doaj_extra: + extra['doaj'] = doaj_extra + if not extra: + extra = None + + re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type='article-journal', + release_stage='published', + title=title, + release_year=release_year, + #release_date, + publisher=publisher, + ext_ids=ext_ids, + contribs=contribs, + volume=volume, + issue=issue, + pages=pages, + language=language, + abstracts=abstracts, + extra=extra, + license_slug=license_slug, + ) + re = self.biblio_hacks(re) + return re + + @staticmethod + def biblio_hacks(re): + """ + This function handles known special cases. For example, + publisher-specific or platform-specific workarounds. + """ + return re + + def try_update(self, re): + + # lookup existing release by DOAJ article id + existing = None + try: + existing = self.api.lookup_release(doaj=re.ext_ids.doaj) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + # then try other ext_id lookups + if not existing: + for extid_type in ('doi', 'pmid', 'pmcid'): + extid_val = getattr(re.ext_ids, extid_type) + if not extid_val: + continue + #print(f" lookup release type: {extid_type} val: {extid_val}") + try: + existing = self.api.lookup_release(**{extid_type: extid_val}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + if existing: + if existing.ext_ids.doaj: + warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}" + warnings.warn(warn_str) + self.counts["skip-doaj-id-mismatch"] += 1 + return False + break + + # TODO: in the future could do fuzzy match here, eg using elasticsearch + + # create entity + if not existing: + return True + + # other logic could go here about skipping updates + if not self.do_updates or existing.ext_ids.doaj: + self.counts['exists'] += 1 + return False + + # fields to copy over for update + existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj + existing.release_type = existing.release_type or re.release_type + existing.release_stage = existing.release_stage or re.release_stage + existing.container_id = existing.container_id or re.container_id + existing.abstracts = existing.abstracts or re.abstracts + existing.extra['doaj'] = re.extra['doaj'] + existing.volume = existing.volume or re.volume + existing.issue = existing.issue or re.issue + existing.pages = existing.pages or re.pages + existing.language = existing.language or re.language + + try: + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + except fatcat_openapi_client.rest.ApiException as err: + # there is a code path where we try to update the same release + # twice in a row; if that happens, just skip + # NOTE: API behavior might change in the future? + if "release_edit_editgroup_id_ident_id_key" in err.body: + self.counts['skip-update-conflict'] += 1 + return False + else: + raise err + + return False + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: + text = clean_str(bibjson.get('abstract')) + if not text or len(text) < 10: + return [] + if len(text) > MAX_ABSTRACT_LENGTH: + text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + + lang = detect_text_lang(text) + + abstract = fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=text, + lang=lang, + ) + + return [abstract,] + + def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: + """ + bibjson.author { + affiliation (string, optional), + name (string), + orcid_id (string, optional) + } + """ + contribs = [] + index = 0 + for author in authors: + if not author.get('name'): + continue + creator_id = None + orcid = clean_orcid(author.get('orcid_id')) + if orcid: + creator_id = self.lookup_orcid(orcid) + contribs.append(fatcat_openapi_client.ReleaseContrib( + raw_name=author.get('name'), + role='author', + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get('affiliation')), + )) + index += 1 + return contribs + + def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + """ + bibjson.identifier { + id (string), + type (string) + } + """ + + assert doaj_article_id.isalnum() and len(doaj_article_id) == 32 + + doi: Optional[str] = None + pmid: Optional[str] = None + pmcid: Optional[str] = None + for id_obj in identifiers: + if not id_obj.get('id'): + continue + if id_obj['type'].lower() == 'doi': + doi = clean_doi(id_obj['id']) + elif id_obj['type'].lower() == 'pmid': + pmid = clean_pmid(id_obj['id']) + elif id_obj['type'].lower() == 'pmcid': + pmcid = clean_pmcid(id_obj['id']) + + return fatcat_openapi_client.ReleaseExtIds( + doaj=doaj_article_id, + doi=doi, + pmid=pmid, + pmcid=pmcid, + ) + + def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]: + """ + bibjson.journal.license { + open_access (boolean, optional), + title (string, optional), + type (string, optional), + url (string, optional), + version (string, optional) + } + """ + if not license_list: + return None + for license in license_list: + if not license.get('open_access'): + continue + slug = license.get('type') + if slug.startswith('CC '): + slug = slug.replace('CC ', 'cc-').lower() + return slug + return None diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4b1d3702..1e04e712 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,6 @@ +import datetime + import fatcat_openapi_client from .common import EntityImporter, make_rel_url @@ -20,10 +22,10 @@ class IngestFileResultImporter(EntityImporter): assert self.default_link_rel self.require_grobid = require_grobid if self.require_grobid: - print("Requiring GROBID status == 200") + print("Requiring GROBID status == 200 (for PDFs)") else: print("NOT checking GROBID success") - self.ingest_request_source_whitelist = [ + self.ingest_request_source_allowlist = [ 'fatcat-changelog', 'fatcat-ingest-container', 'fatcat-ingest', @@ -35,23 +37,41 @@ class IngestFileResultImporter(EntityImporter): 's2-corpus', 's2', ] - if kwargs.get('skip_source_whitelist', False): - self.ingest_request_source_whitelist = [] + if kwargs.get('skip_source_allowlist', False): + self.ingest_request_source_allowlist = [] - def want(self, row): + def want_file(self, row) -> bool: + """ + File-specific part of want(). Generic across general ingest and save-paper-now. """ - Logic here probably needs work (TODO): - - Direct ingests via DOI from fatcat-changelog should probably go - through regardless of GROBID status - - We should filter/block things like single-page PDFs here - - public/anonymous submissions could require successful biblio-glutton - match, or some other sanity check on the fatcat side (eg, fuzzy title - match) - - handle the case of release_stage not being 'published'; if pre-print, - potentially create a new release. + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False - The current logic is intentionally conservative as a first step. + # type-specific filters + if row['request'].get('ingest_type') == 'pdf': + if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: + self.counts['skip-grobid'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("application/pdf",): + self.counts['skip-mimetype'] += 1 + return False + elif row['request'].get('ingest_type') == 'xml': + if row['file_meta'].get('mimetype') not in ("application/xml", + "application/jats+xml", "application/tei+xml", "text/xml"): + self.counts['skip-mimetype'] += 1 + return False + else: + self.counts['skip-ingest-type'] += 1 + return False + + return True + + def want_ingest(self, row) -> bool: + """ + Sandcrawler ingest-specific part of want(). Generic across file and + webcapture ingest. """ if row.get('hit') != True: self.counts['skip-hit'] += 1 @@ -60,33 +80,48 @@ class IngestFileResultImporter(EntityImporter): if not source: self.counts['skip-ingest_request_source'] += 1 return False - if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: + if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: self.counts['skip-ingest_request_source'] += 1 return False - if source.startswith('arabesque'): - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): - self.counts['skip-arabesque-source'] += 1 - return False + + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): + self.counts['skip-link-source'] += 1 + return False + if source.startswith('savepapernow'): # never process async savepapernow requests self.counts['skip-savepapernow'] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + + return True + + def want(self, row): + """ + Overall logic here probably needs work (TODO): + + - Direct ingests via DOI from fatcat-changelog should probably go + through regardless of GROBID status + - We should filter/block things like single-page PDFs here + - public/anonymous submissions could require successful biblio-glutton + match, or some other sanity check on the fatcat side (eg, fuzzy title + match) + - handle the case of release_stage not being 'published'; if pre-print, + potentially create a new release. + + The current logic is intentionally conservative as a first step. + """ + if not self.want_file(row): return False - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + if not self.want_ingest(row): return False return True - def parse_record(self, row): + def parse_ingest_release_ident(self, row): request = row['request'] fatcat = request.get('fatcat') - file_meta = row['file_meta'] - # identify release by fatcat ident, or extid lookup, or biblio-glutton match release_ident = None if fatcat and fatcat.get('release_ident'): release_ident = fatcat.get('release_ident') @@ -112,23 +147,21 @@ class IngestFileResultImporter(EntityImporter): return None release_ident = release.ident break + if self.use_glutton_match and not release_ident and row.get('grobid'): # try biblio-glutton extracted hit if row['grobid'].get('fatcat_release'): release_ident = row['grobid']['fatcat_release'].split('_')[-1] self.counts['glutton-match'] += 1 - if not release_ident: - self.counts['skip-release-not-found'] += 1 - return None + return release_ident + def parse_terminal(self, row): terminal = row.get('terminal') if not terminal: # support old cdx-only ingest results cdx = row.get('cdx') if not cdx: - # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 return None else: terminal = { @@ -142,7 +175,15 @@ class IngestFileResultImporter(EntityImporter): terminal['terminal_url'] = terminal['url'] if not 'terminal_dt' in terminal: terminal['terminal_dt'] = terminal['dt'] + + # convert CDX-style digits to ISO-style timestamp assert len(terminal['terminal_dt']) == 14 + terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + return terminal + + def parse_urls(self, row, terminal): + + request = row['request'] default_rel = self.default_link_rel if request.get('link_source') == 'doi': @@ -159,6 +200,55 @@ class IngestFileResultImporter(EntityImporter): urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + return urls + + def parse_edit_extra(self, row): + + request = row['request'] + edit_extra = dict() + + if request.get('edit_extra'): + edit_extra = request['edit_extra'] + + if request.get('ingest_request_source'): + edit_extra['ingest_request_source'] = request['ingest_request_source'] + if request.get('link_source') and request.get('link_source_id'): + edit_extra['link_source'] = request['link_source'] + edit_extra['link_source_id'] = request['link_source_id'] + + return edit_extra + + def parse_record(self, row): + + request = row['request'] + file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') not in ('pdf', 'xml'): + self.counts['skip-ingest-type'] += 1 + return None + assert (request['ingest_type'], file_meta['mimetype']) in [ + ("pdf", "application/pdf"), + ("xml", "application/xml"), + ("xml", "application/jats+xml"), + ("xml", "application/tei+xml"), + ("xml", "text/xml"), + ] + + # identify release by fatcat ident, or extid lookup, or biblio-glutton match + release_ident = self.parse_ingest_release_ident(row) + + if not release_ident: + self.counts['skip-release-not-found'] += 1 + return None + + terminal = self.parse_terminal(row) + if not terminal: + # TODO: support archive.org hits? + self.counts['skip-no-terminal'] += 1 + return None + + urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( md5=file_meta['md5hex'], @@ -169,17 +259,10 @@ class IngestFileResultImporter(EntityImporter): release_ids=[release_ident], urls=urls, ) - if request.get('edit_extra'): - fe.edit_extra = request['edit_extra'] - else: - fe.edit_extra = dict() - if request.get('ingest_request_source'): - fe.edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - fe.edit_extra['link_source'] = request['link_source'] - fe.edit_extra['link_source_id'] = request['link_source_id'] - if not fe.edit_extra: - fe.edit_extra = None + + edit_extra = self.parse_edit_extra(row) + if edit_extra: + fe.edit_extra = edit_extra return fe def try_update(self, fe): @@ -244,6 +327,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): + if not self.want_file(row): + return False + source = row['request'].get('ingest_request_source') if not source: self.counts['skip-ingest_request_source'] += 1 @@ -254,12 +340,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter): if row.get('hit') != True: self.counts['skip-hit'] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 - return False - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 - return False return True @@ -280,3 +360,154 @@ class SavePaperNowFileImporter(IngestFileResultImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) + + +class IngestWebResultImporter(IngestFileResultImporter): + """ + Variant of IngestFileResultImporter for processing HTML ingest requests + into webcapture objects. + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') + kwargs['do_updates'] = False + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + def want(self, row): + + if not self.want_ingest(row): + return False + + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False + + # webcapture-specific filters + if row['request'].get('ingest_type') != 'html': + self.counts['skip-ingest-type'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): + self.counts['skip-mimetype'] += 1 + return False + + return True + + def parse_record(self, row): + + request = row['request'] + file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') != "html": + self.counts['skip-ingest-type'] += 1 + return None + if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): + self.counts['skip-mimetype'] += 1 + return None + + # identify release by fatcat ident, or extid lookup + release_ident = self.parse_ingest_release_ident(row) + + if not release_ident: + self.counts['skip-release-not-found'] += 1 + return None + + terminal = self.parse_terminal(row) + if not terminal: + # TODO: support archive.org hits? + self.counts['skip-no-terminal'] += 1 + return None + + urls = self.parse_urls(row, terminal) + archive_urls = [u for u in urls if u.rel == 'webarchive'] + + if terminal['terminal_status_code'] != 200: + self.counts['skip-terminal-status-code'] += 1 + return None + + terminal_cdx = row['cdx'] + if 'revisit_cdx' in row: + terminal_cdx = row['revisit_cdx'] + assert terminal_cdx['surt'] + assert terminal_cdx['url'] == terminal['terminal_url'] + + wc_cdx = [] + # primary resource first + wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal_cdx['surt'], + timestamp=terminal['terminal_timestamp'], + url=terminal['terminal_url'], + mimetype=file_meta['mimetype'], + status_code=terminal['terminal_status_code'], + sha1=file_meta['sha1hex'], + sha256=file_meta['sha256hex'], + size=file_meta['size_bytes'], + )) + + for resource in row.get('html_resources', []): + timestamp = resource['timestamp'] + if not "+" in timestamp and not "Z" in timestamp: + timestamp += "Z" + wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( + surt=resource['surt'], + timestamp=timestamp, + url=resource['url'], + mimetype=resource.get('mimetype'), + size=resource.get('size'), + sha1=resource.get('sha1hex'), + sha256=resource.get('sha256hex'), + )) + + wc = fatcat_openapi_client.WebcaptureEntity( + cdx=wc_cdx, + archive_urls=archive_urls, + original_url=terminal['terminal_url'], + timestamp=terminal['terminal_timestamp'], + release_ids=[release_ident], + ) + + edit_extra = self.parse_edit_extra(row) + + if edit_extra: + wc.edit_extra = edit_extra + return wc + + def try_update(self, wc): + + # check for existing edits-in-progress with same URL + for other in self._entity_queue: + if other.original_url == wc.original_url: + self.counts['skip-in-queue'] += 1 + return False + + # lookup sha1, or create new entity (TODO: API doesn't support this yet) + #existing = None + + # TODO: currently only allow one release per webcapture + release = self.api.get_release(wc.release_ids[0], expand="webcaptures") + if release.webcaptures: + # check if this is an existing match, or just a similar hit + for other in release.webcaptures: + if wc.original_url == other.original_url: + # TODO: compare very similar timestamps of same time (different formats) + self.counts['exists'] += 1 + return False + self.counts['skip-release-has-webcapture'] += 1 + return False + + # Ok, if we got here then no existing web capture for (first) release, + # so go ahead and insert! + return True + + def insert_batch(self, batch): + self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 10a90dba..9ee641fa 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -5,6 +5,13 @@ free-form input, titles, etc. """ import re +import base64 +from typing import Optional +import unicodedata + +import ftfy +import langdetect +import pycountry DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") @@ -55,7 +62,8 @@ def clean_doi(raw): # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but # for now block specific characters so we can get PubMed importer running # again. - if 'ä' in raw or '\u200e' in raw: + # known characters to skip: ä á \u200e \u2043 \u2012 + if not raw.isascii(): return None return raw @@ -72,6 +80,10 @@ def test_clean_doi(): assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character assert clean_doi("10.6002/ect.2020.häyry") == None # this example via pubmed (pmid:32519616) assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None + assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None + assert clean_doi("10.4025/diálogos.v17i2.36030") == None + assert clean_doi("10.19027/jai.10.106‒115") == None + assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") @@ -233,3 +245,322 @@ def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" assert clean_orcid("01234567-3456-6780") == None assert clean_orcid("0x23-4567-3456-6780") == None + + +def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. + + It will try to clean up common unicode mangles, HTML characters, etc. + + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. + """ + if not thing: + return None + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed or len(fixed) <= 1: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean_str(): + + assert clean_str(None) == None + assert clean_str('') == None + assert clean_str('1') == None + assert clean_str('123') == '123' + assert clean_str('a&b') == 'a&b' + assert clean_str('<b>a&b</b>') == '<b>a&b</b>' + assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def is_cjk(s): + if not s: + return False + for c in s: + if c.isalpha(): + lang_prefix = unicodedata.name(c).split()[0] + return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') + return False + +def test_is_cjk(): + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True + +MONTH_MAP = { + "jan": 1, "january": 1, + "feb": 2, "febuary": 2, + "mar": 3, "march": 3, + "apr": 4, "april": 4, + "may": 5, "may": 5, + "jun": 6, "june": 6, + "jul": 7, "july": 7, + "aug": 8, "august": 8, + "sep": 9, "september": 9, + "oct": 10, "october": 10, + "nov": 11, "nov": 11, + "dec": 12, "december": 12, +} + +def parse_month(raw: Optional[str]) -> Optional[int]: + """ + Parses a string into a month number (1 to 12) + """ + if not raw: + return None + raw = raw.strip().lower() + if raw.isdigit(): + raw_int = int(raw) + if raw_int >= 1 and raw_int <= 12: + return raw_int + else: + return None + if raw in MONTH_MAP: + return MONTH_MAP[raw] + return None + +def test_parse_month() -> None: + + assert parse_month(None) == None + assert parse_month("") == None + assert parse_month("0") == None + assert parse_month("10") == 10 + assert parse_month("jan") == 1 + assert parse_month("September") == 9 + +def detect_text_lang(raw: str) -> Optional[str]: + """ + Tries to determine language of, eg, an abstract. + + Returns an ISO 631 2-char language code, or None. + """ + if not raw: + return None + try: + lang = langdetect.detect(raw) + lang = lang.split('-')[0] + assert len(lang) == 2 + return lang + except (langdetect.lang_detect_exception.LangDetectException, TypeError): + return None + return None + +def test_detect_text_lang() -> None: + assert detect_text_lang("") == None + EN_SAMPLE = "this is a string of English text for testing" + assert detect_text_lang(EN_SAMPLE) == "en" + JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。" + assert detect_text_lang(JA_SAMPLE) == "ja" + ZH_SAMPLE = "随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。" + # XXX: why does this detect as `ko` sometimes? + assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko") + +def parse_lang_name(raw: Optional[str]) -> Optional[str]: + """ + Parses a language name and returns a 2-char ISO 631 language code. + """ + if not raw: + return None + try: + lang = pycountry.languages.lookup(raw) + if lang.alpha_3 in ("mul", "mis"): + return None + return lang.alpha_2.lower() + except LookupError: + #print(f" unknown language: '{raw}', file=sys.stderr) + return None + except AttributeError: + #print(f" partial language metadata: '{lang}', file=sys.stderr) + return None + return None + +def test_parse_lang_name() -> None: + + assert parse_lang_name(None) == None + assert parse_lang_name("") == None + assert parse_lang_name("asdf ") == None + assert parse_lang_name("english") == "en" + assert parse_lang_name("ENGLISH") == "en" + assert parse_lang_name("asdf blah") is None + assert parse_lang_name("en") == "en" + assert parse_lang_name("EN") == "en" + assert parse_lang_name("ENG") == "en" + assert parse_lang_name("English") == "en" + assert parse_lang_name("Portuguese") == "pt" + + +def parse_country_name(s: Optional[str]) -> Optional[str]: + """ + Parses a country name into a ISO country code (2-char). + + This version copied from the chocula repository. + """ + if not s or s in ("Unknown"): + return None + + s = s.strip() + if s.lower() in ("usa", "new york (state)", "washington (state)"): + return "us" + if s.lower() in ("russia (federation)", "russia"): + return "ru" + if s == "Québec (Province)": + s = "Canada" + if s == "China (Republic : 1949- )": + return "tw" + if s == "Brunei": + return "bn" + if s.startswith("Congo "): + s = "Congo" + if s.lower() == "iran": + return "ir" + if s.lower() == "bermuda islands": + return "bm" + if s.lower() == "burma": + s = "myanmar" + if s.lower() in ("korea (south)", "south korea"): + return "kr" + if s.lower() in ("england", "scotland", "wales"): + return "uk" + s = s.replace(" (Republic)", "").replace(" (Federation)", "") + + try: + country = pycountry.countries.lookup(s) + except LookupError: + country = None + + if country: + return country.alpha_2.lower() + try: + sub = pycountry.subdivisions.lookup(s) + except LookupError: + sub = None + + s = s.replace(" (State)", "").replace(" (Province)", "") + if sub: + return sub.country_code.lower() + + else: + # print(f"unknown country: {s}", file=sys.stderr) + return None + + +def test_parse_country_name(): + assert parse_country_name("") is None + assert parse_country_name("asdf blah") is None + assert parse_country_name("us") == "us" + assert parse_country_name("USA") == "us" + assert parse_country_name("United States of America") == "us" + assert parse_country_name("united States") == "us" + assert parse_country_name("Massachusetts") == "us" + assert parse_country_name("Russia") == "ru" + assert parse_country_name("Japan") == "jp" + +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? +# PubMed/MEDLINE and JSTOR use these MARC codes +# https://www.loc.gov/marc/languages/language_name.html +LANG_MAP_MARC = { + 'afr': 'af', + 'alb': 'sq', + 'amh': 'am', + 'ara': 'ar', + 'arm': 'hy', + 'aze': 'az', + 'ben': 'bn', + 'bos': 'bs', + 'bul': 'bg', + 'cat': 'ca', + 'chi': 'zh', + 'cze': 'cs', + 'dan': 'da', + 'dut': 'nl', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'fin': 'fi', + 'fre': 'fr', + 'geo': 'ka', + 'ger': 'de', + 'gla': 'gd', + 'gre': 'el', + 'heb': 'he', + 'hin': 'hi', + 'hrv': 'hr', + 'hun': 'hu', + 'ice': 'is', + 'ind': 'id', + 'ita': 'it', + 'jpn': 'ja', + 'kin': 'rw', + 'kor': 'ko', + 'lat': 'la', + 'lav': 'lv', + 'lit': 'lt', + 'mac': 'mk', + 'mal': 'ml', + 'mao': 'mi', + 'may': 'ms', + 'nor': 'no', + 'per': 'fa', + 'per': 'fa', + 'pol': 'pl', + 'por': 'pt', + 'pus': 'ps', + 'rum': 'ro', + 'rus': 'ru', + 'san': 'sa', + 'slo': 'sk', + 'slv': 'sl', + 'spa': 'es', + 'srp': 'sr', + 'swe': 'sv', + 'tha': 'th', + 'tur': 'tr', + 'ukr': 'uk', + 'urd': 'ur', + 'vie': 'vi', + 'wel': 'cy', + +# additions + 'gle': 'ga', # "Irish" (Gaelic) + 'jav': 'jv', # Javanese + 'welsh': 'cy', # Welsh + 'oci': 'oc', # Occitan + +# Don't have ISO 639-1 codes + 'grc': 'el', # Ancient Greek; map to modern greek + 'map': None, # Austronesian (collection) + 'syr': None, # Syriac, Modern + 'gem': None, # Old Saxon + 'non': None, # Old Norse + 'emg': None, # Eastern Meohang + 'neg': None, # Negidal + 'mul': None, # Multiple languages + 'und': None, # Undetermined +} diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index dfb5f799..ad4b7722 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,10 +1,13 @@ import datetime +from typing import Optional import tldextract +from fatcat_openapi_client import ReleaseEntity, ContainerEntity -def check_kbart(year, archive): + +def check_kbart(year: int, archive: dict) -> Optional[bool]: if not archive or not archive.get('year_spans'): return None for span in archive['year_spans']: @@ -12,7 +15,7 @@ def check_kbart(year, archive): return True return False -def test_check_kbart(): +def test_check_kbart() -> None: assert check_kbart(1990, dict()) is None assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False @@ -21,10 +24,13 @@ def test_check_kbart(): assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True -def release_to_elasticsearch(entity, force_bool=True): +def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict: """ Converts from an entity model/schema to elasticsearch oriented schema. + This is a large/complex transform, so subsets are split out into helper + functions. + Returns: dict Raises exception on error (never returns None) """ @@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True): mag_id = release.ext_ids.mag, ) - is_oa = None - is_preserved = None - is_longtail_oa = None - in_kbart = None - in_jstor = False - in_web = False - in_dweb = False - in_ia = False - in_ia_sim = False - in_shadows = False + t.update(dict( + is_oa = None, + is_longtail_oa = None, + is_preserved = None, + in_web = False, + in_dweb = False, + in_ia = False, + in_ia_sim = False, + in_kbart = None, + in_jstor = False, + in_shadows = False, + )) release_year = release.release_year if release.release_date: @@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: mapping... probably by lookup? t['affiliation_rors'] = None - this_year = datetime.date.today().year - container = release.container - if container: - t['publisher'] = container.publisher - t['container_name'] = container.name - # this is container.ident, not release.container_id, because there may - # be a redirect involved - t['container_id'] = container.ident - t['container_issnl'] = container.issnl - t['container_type'] = container.container_type - if container.extra: - c_extra = container.extra - if c_extra.get('kbart') and release_year: - in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor')) - in_kbart = in_jstor - for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', - 'hathitrust', 'scholarsportal', 'cariniana'): - in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive)) - # recent KBART coverage is often not updated for the - # current year. So for current-year publications, consider - # coverage from *last* year to also be included in the - # Keeper - if not in_kbart and release_year == this_year: - in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) - - if c_extra.get('ia'): - if c_extra['ia'].get('sim') and release_year: - in_ia_sim = check_kbart(release_year, c_extra['ia']['sim']) - if c_extra['ia'].get('longtail_oa'): - is_longtail_oa = True - if c_extra.get('sherpa_romeo'): - if c_extra['sherpa_romeo'].get('color') == 'white': - is_oa = False - if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): - is_oa = True - if c_extra.get('doaj'): - if c_extra['doaj'].get('as_of'): - is_oa = True - if c_extra.get('road'): - if c_extra['road'].get('as_of'): - is_oa = True - if c_extra.get('szczepanski'): - if c_extra['szczepanski'].get('as_of'): - is_oa = True - if c_extra.get('country'): - t['country_code'] = c_extra['country'] - t['country_code_upper'] = c_extra['country'].upper() - if c_extra.get('publisher_type'): - t['publisher_type'] = c_extra['publisher_type'] + if release.container: + t.update(_rte_container_helper(release.container, release_year)) # fall back to release-level container metadata if container not linked or # missing context @@ -174,67 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True): t['container_name'] = release.extra.get('container_name') if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): - in_jstor = True + t['in_jstor'] = True - files = release.files or [] - t['file_count'] = len(files) - t['fileset_count'] = len(release.filesets or []) - t['webcapture_count'] = len(release.webcaptures or []) - any_pdf_url = None - good_pdf_url = None - best_pdf_url = None - ia_pdf_url = None - for f in files: - if f.extra and f.extra.get('shadows'): - # TODO: shadow check goes here - in_shadows = True - is_pdf = 'pdf' in (f.mimetype or '') - for release_url in (f.urls or []): - if not f.mimetype and 'pdf' in release_url.url.lower(): - is_pdf = True - if release_url.url.lower().startswith('http'): - in_web = True - if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): - # not sure what rel will be for this stuff - in_dweb = True - if is_pdf: - any_pdf_url = release_url.url - if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf: - is_preserved = True - good_pdf_url = release_url.url - if '//www.jstor.org/' in release_url.url: - in_jstor = True - if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: - in_ia = True - if is_pdf: - best_pdf_url = release_url.url - ia_pdf_url = release_url.url - # here is where we bake-in priority; IA-specific - t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url - t['ia_pdf_url'] = ia_pdf_url + # transform file/fileset/webcapture related fields + t.update(_rte_content_helper(release)) + + if release.ext_ids.doaj: + t['is_oa'] = True if release.license_slug: # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): - is_oa = True + t['is_oa'] = True if release.license_slug.startswith("ARXIV-"): - is_oa = True + t['is_oa'] = True extra = release.extra or dict() if extra: if extra.get('is_oa'): # NOTE: not actually setting this anywhere... but could - is_oa = True + t['is_oa'] = True if extra.get('longtail_oa'): # sometimes set by GROBID/matcher - is_oa = True - is_longtail_oa = True + t['is_oa'] = True + t['is_longtail_oa'] = True if not t.get('container_name'): t['container_name'] = extra.get('container_name') if extra.get('crossref'): if extra['crossref'].get('archive'): # all crossref archives are KBART, I believe - in_kbart = True + t['in_kbart'] = True # backwards compatible subtitle fetching if not t['subtitle'] and extra.get('subtitle'): if type(extra['subtitle']) == list: @@ -251,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: non-numerical first pages t['ia_microfilm_url'] = None - if in_ia_sim: + if t['in_ia_sim']: # TODO: determine URL somehow? I think this is in flux. Will probably # need extra metadata in the container extra field. # special case as a demo for now. @@ -277,42 +207,168 @@ def release_to_elasticsearch(entity, force_bool=True): if t['doi']: t['doi_prefix'] = t['doi'].split('/')[0] - if is_longtail_oa: - is_oa = True + if t['is_longtail_oa']: + t['is_oa'] = True + # optionally coerce all flags from Optional[bool] to bool if force_bool: - t['is_oa'] = bool(is_oa) - t['is_longtail_oa'] = bool(is_longtail_oa) - t['in_kbart'] = bool(in_kbart) - t['in_ia_sim'] = bool(in_ia_sim) - t['in_jstor'] = bool(in_jstor) - t['in_web'] = bool(in_web) - t['in_dweb'] = bool(in_dweb) - t['in_shadows'] = bool(in_shadows) - else: - t['is_oa'] = is_oa - t['is_longtail_oa'] = is_longtail_oa - t['in_kbart'] = in_kbart - t['in_ia_sim'] = in_ia_sim - t['in_jstor'] = in_jstor - t['in_web'] = in_web - t['in_dweb'] = in_dweb - t['in_shadows'] = in_shadows - - t['in_ia'] = bool(in_ia) - t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id')) + for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim', + 'in_jstor', 'in_web', 'in_dweb', 'in_shadows'): + t[k] = bool(t[k]) + + t['in_ia'] = bool(t['in_ia']) + t['is_preserved'] = ( + bool(t['is_preserved']) + or t['in_ia'] + or t['in_kbart'] + or t['in_jstor'] + or t.get('pmcid') + or t.get('arxiv_id') + ) - if in_ia: + if t['in_ia']: t['preservation'] = 'bright' - elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'): + elif t['is_preserved']: t['preservation'] = 'dark' - elif in_shadows: + elif t['in_shadows']: t['preservation'] = 'shadows_only' else: t['preservation'] = 'none' return t +def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict: + """ + Container metadata sub-section of release_to_elasticsearch() + """ + this_year = datetime.date.today().year + t = dict() + t['publisher'] = container.publisher + t['container_name'] = container.name + # this is container.ident, not release.container_id, because there may + # be a redirect involved + t['container_id'] = container.ident + t['container_issnl'] = container.issnl + t['container_type'] = container.container_type + if container.extra: + c_extra = container.extra + if c_extra.get('kbart') and release_year: + if check_kbart(release_year, c_extra['kbart'].get('jstor')): + t['in_jstor'] = True + if t.get('in_kbart') or t.get('in_jstor'): + t['in_kbart'] = True + for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', + 'hathitrust', 'scholarsportal', 'cariniana'): + t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive)) + # recent KBART coverage is often not updated for the + # current year. So for current-year publications, consider + # coverage from *last* year to also be included in the + # Keeper + if not t.get('in_kbart') and release_year == this_year: + t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) + + if c_extra.get('ia'): + if c_extra['ia'].get('sim') and release_year: + t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim']) + if c_extra['ia'].get('longtail_oa'): + t['is_longtail_oa'] = True + if c_extra.get('sherpa_romeo'): + if c_extra['sherpa_romeo'].get('color') == 'white': + t['is_oa'] = False + if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): + t['is_oa'] = True + if c_extra.get('doaj'): + if c_extra['doaj'].get('as_of'): + t['is_oa'] = True + if c_extra.get('road'): + if c_extra['road'].get('as_of'): + t['is_oa'] = True + if c_extra.get('szczepanski'): + if c_extra['szczepanski'].get('as_of'): + t['is_oa'] = True + if c_extra.get('country'): + t['country_code'] = c_extra['country'] + t['country_code_upper'] = c_extra['country'].upper() + if c_extra.get('publisher_type'): + t['publisher_type'] = c_extra['publisher_type'] + return t + +def _rte_content_helper(release: ReleaseEntity) -> dict: + """ + File/FileSet/WebCapture sub-section of release_to_elasticsearch() + + The current priority order for "best_pdf_url" is: + - internet archive urls (archive.org or web.archive.org) + - other webarchive or repository URLs + - any other URL + """ + t = dict( + file_count = len(release.files or []), + fileset_count = len(release.filesets or []), + webcapture_count = len(release.webcaptures or []), + ) + + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None + + for f in release.files or []: + if f.extra and f.extra.get('shadows'): + t['in_shadows'] = True + is_pdf = 'pdf' in (f.mimetype or '') + for release_url in (f.urls or []): + # first generic flags + t.update(_rte_url_helper(release_url)) + + # then PDF specific stuff (for generating "best URL" fields) + if not f.mimetype and 'pdf' in release_url.url.lower(): + is_pdf = True + if is_pdf: + any_pdf_url = release_url.url + if release_url.rel in ('webarchive', 'repository', 'repo'): + good_pdf_url = release_url.url + if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: + best_pdf_url = release_url.url + ia_pdf_url = release_url.url + + # here is where we bake-in PDF url priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + for fs in release.filesets or []: + for url_obj in (fs.urls or []): + t.update(_rte_url_helper(url_obj)) + + for wc in release.webcaptures or []: + for url_obj in (wc.archive_urls or []): + t.update(_rte_url_helper(url_obj)) + + return t + +def _rte_url_helper(url_obj) -> dict: + """ + Takes a location URL ('url' and 'rel' keys) and returns generic preservation status. + + Designed to work with file, webcapture, or fileset URLs. + + Returns a dict; should *not* include non-True values for any keys because + these will be iteratively update() into the overal object. + """ + t = dict() + if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'): + t['is_preserved'] = True + if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url: + t['in_ia'] = True + if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'): + t['in_web'] = True + if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # not sure what rel will be for this stuff + t['in_dweb'] = True + if '//www.jstor.org/' in url_obj.url: + t['in_jstor'] = True + return t + def container_to_elasticsearch(entity, force_bool=True): """ diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 2f4e2271..59831017 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -15,15 +15,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= if release.state != 'active': return None + # TODO: infer ingest type based on release_type or container metadata? + if not ingest_type: + ingest_type = 'pdf' + # generate a URL where we expect to find fulltext url = None link_source = None link_source_id = None - if release.ext_ids.arxiv: + if release.ext_ids.arxiv and ingest_type == "pdf": url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) link_source = "arxiv" link_source_id = release.ext_ids.arxiv - elif release.ext_ids.pmcid: + elif release.ext_ids.pmcid and ingest_type == "pdf": # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) @@ -40,10 +44,6 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ext_ids = release.ext_ids.to_dict() ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) - # TODO: infer ingest type based on release_type or container metadata? - if not ingest_type: - ingest_type = 'pdf' - ingest_request = { 'ingest_type': ingest_type, 'ingest_request_source': ingest_request_source, diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 2111a20d..94791770 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -326,6 +326,8 @@ class EntityUpdatesWorker(FatcatWorker): release_ids = [] new_release_ids = [] file_ids = [] + fileset_ids = [] + webcapture_ids = [] container_ids = [] work_ids = [] release_edits = cle['editgroup']['edits']['releases'] @@ -337,6 +339,12 @@ class EntityUpdatesWorker(FatcatWorker): file_edits = cle['editgroup']['edits']['files'] for e in file_edits: file_ids.append(e['ident']) + fileset_edits = cle['editgroup']['edits']['filesets'] + for e in fileset_edits: + fileset_ids.append(e['ident']) + webcapture_edits = cle['editgroup']['edits']['webcaptures'] + for e in webcapture_edits: + webcapture_ids.append(e['ident']) container_edits = cle['editgroup']['edits']['containers'] for e in container_edits: container_ids.append(e['ident']) @@ -348,8 +356,8 @@ class EntityUpdatesWorker(FatcatWorker): for ident in set(file_ids): file_entity = self.api.get_file(ident, expand=None) # update release when a file changes - # TODO: fetch old revision as well, and only update - # releases for which list changed + # TODO: also fetch old version of file and update any *removed* + # release idents (and same for filesets, webcapture updates) release_ids.extend(file_entity.release_ids or []) file_dict = self.api.api_client.sanitize_for_serialization(file_entity) producer.produce( @@ -358,6 +366,19 @@ class EntityUpdatesWorker(FatcatWorker): key=ident.encode('utf-8'), on_delivery=fail_fast, ) + + # TODO: topic for fileset updates + for ident in set(fileset_ids): + fileset_entity = self.api.get_fileset(ident, expand=None) + # update release when a fileset changes + release_ids.extend(file_entity.release_ids or []) + + # TODO: topic for webcapture updates + for ident in set(webcapture_ids): + webcapture_entity = self.api.get_webcapture(ident, expand=None) + # update release when a webcapture changes + release_ids.extend(webcapture_entity.release_ids or []) + for ident in set(container_ids): container = self.api.get_container(ident) container_dict = self.api.api_client.sanitize_for_serialization(container) @@ -367,6 +388,7 @@ class EntityUpdatesWorker(FatcatWorker): key=ident.encode('utf-8'), on_delivery=fail_fast, ) + for ident in set(release_ids): release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") if release.work_id: @@ -378,7 +400,7 @@ class EntityUpdatesWorker(FatcatWorker): key=ident.encode('utf-8'), on_delivery=fail_fast, ) - # filter to "new" active releases with no matched files + # for ingest requests, filter to "new" active releases with no matched files if release.ident in new_release_ids: ir = release_ingest_request(release, ingest_request_source='fatcat-changelog') if ir and not release.files and self.want_live_ingest(release, ir): |