Diffstat (limited to 'python')
-rw-r--r--  python/README_import.md | 7
-rwxr-xr-x  python/fatcat_import.py | 85
-rwxr-xr-x  python/fatcat_ingest.py | 4
-rw-r--r--  python/fatcat_tools/importers/__init__.py | 3
-rw-r--r--  python/fatcat_tools/importers/common.py | 158
-rw-r--r--  python/fatcat_tools/importers/crossref.py | 2
-rw-r--r--  python/fatcat_tools/importers/datacite.py | 10
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py | 358
-rw-r--r--  python/fatcat_tools/importers/ingest.py | 329
-rw-r--r--  python/fatcat_tools/normal.py | 333
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py | 318
-rw-r--r--  python/fatcat_tools/transforms/ingest.py | 12
-rw-r--r--  python/fatcat_tools/workers/changelog.py | 28
-rw-r--r--  python/fatcat_web/templates/release_view.html | 8
-rw-r--r--  python/tests/files/example_doaj_articles.json | 5
-rw-r--r--  python/tests/files/example_ingest.json | 2
-rw-r--r--  python/tests/files/example_ingest_html.json | 1
-rw-r--r--  python/tests/files/example_ingest_xml.json | 1
-rw-r--r--  python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json | 1
-rw-r--r--  python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json | 1
-rw-r--r--  python/tests/import_doaj.py | 142
-rw-r--r--  python/tests/import_ingest.py | 68
-rw-r--r--  python/tests/transform_elasticsearch.py | 95
23 files changed, 1600 insertions(+), 371 deletions(-)
diff --git a/python/README_import.md b/python/README_import.md
index 65c08f8b..71b15eee 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -126,3 +126,10 @@ Run import in parallel:
zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - --default-mime 'application/pdf'
+## DOAJ
+
+Takes a few hours.
+
+ export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_DOAJ)
+
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
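Note: the new doaj-article sub-command can also consume from the api-doaj Kafka topic instead of stdin via the --kafka-mode flag added below in fatcat_import.py. A rough sketch of that invocation, assuming the usual global Kafka host/env options of fatcat_import.py:

    export FATCAT_API_AUTH_TOKEN=...   # the FATCAT_AUTH_WORKER_DOAJ token
    ./fatcat_import.py doaj-article --kafka-mode --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -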
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index e92b3106..ff6c94dc 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -126,7 +126,7 @@ def run_arabesque_match(args):
def run_ingest_file(args):
ifri = IngestFileResultImporter(args.api,
editgroup_description=args.editgroup_description_override,
- skip_source_whitelist=args.skip_source_whitelist,
+ skip_source_allowlist=args.skip_source_allowlist,
do_updates=args.do_updates,
default_link_rel=args.default_link_rel,
require_grobid=(not args.no_require_grobid),
@@ -144,6 +144,26 @@ def run_ingest_file(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+def run_ingest_web(args):
+ iwri = IngestWebResultImporter(args.api,
+ editgroup_description=args.editgroup_description_override,
+ skip_source_allowlist=args.skip_source_allowlist,
+ do_updates=args.do_updates,
+ default_link_rel=args.default_link_rel,
+ edit_batch_size=args.batch_size)
+ if args.kafka_mode:
+ KafkaJsonPusher(
+ iwri,
+ args.kafka_hosts,
+ args.kafka_env,
+ "ingest-file-results",
+ "fatcat-{}-ingest-web-result".format(args.kafka_env),
+ kafka_namespace="sandcrawler",
+ consume_batch_size=args.batch_size,
+ ).run()
+ else:
+ JsonLinePusher(iwri, args.json_file).run()
+
def run_savepapernow_file(args):
ifri = SavePaperNowFileImporter(args.api,
editgroup_description=args.editgroup_description_override,
@@ -236,6 +256,24 @@ def run_datacite(args):
else:
JsonLinePusher(dci, args.json_file).run()
+def run_doaj_article(args):
+ dai = DoajArticleImporter(args.api,
+ args.issn_map_file,
+ edit_batch_size=args.batch_size,
+ do_updates=args.do_updates,
+ )
+ if args.kafka_mode:
+ KafkaJsonPusher(
+ dai,
+ args.kafka_hosts,
+ args.kafka_env,
+ "api-doaj",
+ "fatcat-{}-import-doaj".format(args.kafka_env),
+ consume_batch_size=args.batch_size,
+ ).run()
+ else:
+ JsonLinePusher(dai, args.json_file).run()
+
def run_file_meta(args):
# do_updates defaults to true for this importer
fmi = FileMetaImporter(args.api,
@@ -442,9 +480,9 @@ def main():
sub_ingest_file.add_argument('json_file',
help="ingest_file JSON file to import from",
default=sys.stdin, type=argparse.FileType('r'))
- sub_ingest_file.add_argument('--skip-source-whitelist',
+ sub_ingest_file.add_argument('--skip-source-allowlist',
action='store_true',
- help="don't filter import based on request source whitelist")
+ help="don't filter import based on request source allowlist")
sub_ingest_file.add_argument('--kafka-mode',
action='store_true',
help="consume from kafka topic (not stdin)")
@@ -458,6 +496,28 @@ def main():
default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
+ sub_ingest_web = subparsers.add_parser('ingest-web-results',
+ help="add/update web entities linked to releases based on sandcrawler ingest results")
+ sub_ingest_web.set_defaults(
+ func=run_ingest_web,
+ auth_var="FATCAT_AUTH_WORKER_CRAWL",
+ )
+ sub_ingest_web.add_argument('json_file',
+ help="ingest_web JSON file to import from",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_ingest_web.add_argument('--skip-source-allowlist',
+ action='store_true',
+ help="don't filter import based on request source allowlist")
+ sub_ingest_web.add_argument('--kafka-mode',
+ action='store_true',
+ help="consume from kafka topic (not stdin)")
+ sub_ingest_web.add_argument('--do-updates',
+ action='store_true',
+ help="update pre-existing web entities if new match (instead of skipping)")
+ sub_ingest_web.add_argument('--default-link-rel',
+ default="web",
+ help="default URL rel for matches (eg, 'publisher', 'web')")
+
sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results',
help="add file entities crawled due to async Save Paper Now request")
sub_savepapernow_file.set_defaults(
@@ -564,6 +624,25 @@ def main():
auth_var="FATCAT_AUTH_WORKER_DATACITE",
)
+ sub_doaj_article = subparsers.add_parser('doaj-article',
+ help="import doaj.org article metadata")
+ sub_doaj_article.add_argument('json_file',
+ help="File with JSON lines from DOAJ API (or bulk dump) to import from",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_doaj_article.add_argument('--issn-map-file',
+ help="ISSN to ISSN-L mapping file",
+ default=None, type=argparse.FileType('r'))
+ sub_doaj_article.add_argument('--kafka-mode',
+ action='store_true',
+ help="consume from kafka topic (not stdin)")
+ sub_doaj_article.add_argument('--do-updates',
+ action='store_true',
+ help="update any pre-existing release entities")
+ sub_doaj_article.set_defaults(
+ func=run_doaj_article,
+ auth_var="FATCAT_AUTH_WORKER_DOAJ",
+ )
+
sub_file_meta = subparsers.add_parser('file-meta',
help="simple update-only importer for file metadata")
sub_file_meta.set_defaults(
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 68676ad2..b9d71a7c 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -87,6 +87,7 @@ def _run_search_dump(args, search):
ingest_request = release_ingest_request(
release,
ingest_request_source="fatcat-ingest",
+ ingest_type=args.ingest_type,
)
if not ingest_request:
continue
@@ -214,6 +215,9 @@ def main():
parser.add_argument('--force-recrawl',
action='store_true',
help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl")
+ parser.add_argument('--ingest-type',
+ default="pdf",
+ help="What medium to ingest (pdf, xml, html)")
subparsers = parser.add_subparsers()
sub_container = subparsers.add_parser('container',
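The new --ingest-type option is a global flag, so it must precede the subcommand. For example, a hypothetical request for HTML ingest of a container (the container subcommand's own options are not shown in this diff, so they are elided):

    ./fatcat_ingest.py --ingest-type html container ...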
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index b82eb11a..d2928d09 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -27,6 +27,7 @@ from .orcid import OrcidImporter
from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
-from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter
from .shadow import ShadowLibraryImporter
from .file_meta import FileMetaImporter
+from .doaj_article import DoajArticleImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
import sys
import csv
import json
-import ftfy
-import base64
import sqlite3
import datetime
import subprocess
-import unicodedata
from collections import Counter
from confluent_kafka import Consumer, KafkaException
import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
import fatcat_openapi_client
from fatcat_openapi_client.rest import ApiException
+# TODO: refactor to remove the need for this (re-imports for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
- 'afr': 'af',
- 'alb': 'sq',
- 'amh': 'am',
- 'ara': 'ar',
- 'arm': 'hy',
- 'aze': 'az',
- 'ben': 'bn',
- 'bos': 'bs',
- 'bul': 'bg',
- 'cat': 'ca',
- 'chi': 'zh',
- 'cze': 'cs',
- 'dan': 'da',
- 'dut': 'nl',
- 'eng': 'en',
- 'epo': 'eo',
- 'est': 'et',
- 'fin': 'fi',
- 'fre': 'fr',
- 'geo': 'ka',
- 'ger': 'de',
- 'gla': 'gd',
- 'gre': 'el',
- 'heb': 'he',
- 'hin': 'hi',
- 'hrv': 'hr',
- 'hun': 'hu',
- 'ice': 'is',
- 'ind': 'id',
- 'ita': 'it',
- 'jpn': 'ja',
- 'kin': 'rw',
- 'kor': 'ko',
- 'lat': 'la',
- 'lav': 'lv',
- 'lit': 'lt',
- 'mac': 'mk',
- 'mal': 'ml',
- 'mao': 'mi',
- 'may': 'ms',
- 'nor': 'no',
- 'per': 'fa',
- 'per': 'fa',
- 'pol': 'pl',
- 'por': 'pt',
- 'pus': 'ps',
- 'rum': 'ro',
- 'rus': 'ru',
- 'san': 'sa',
- 'slo': 'sk',
- 'slv': 'sl',
- 'spa': 'es',
- 'srp': 'sr',
- 'swe': 'sv',
- 'tha': 'th',
- 'tur': 'tr',
- 'ukr': 'uk',
- 'urd': 'ur',
- 'vie': 'vi',
- 'wel': 'cy',
-
-# additions
- 'gle': 'ga', # "Irish" (Gaelic)
- 'jav': 'jv', # Javanese
- 'welsh': 'cy', # Welsh
- 'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
- 'grc': 'el', # Ancient Greek; map to modern greek
- 'map': None, # Austronesian (collection)
- 'syr': None, # Syriac, Modern
- 'gem': None, # Old Saxon
- 'non': None, # Old Norse
- 'emg': None, # Eastern Meohang
- 'neg': None, # Negidal
- 'mul': None, # Multiple languages
- 'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
- """
- This function is appropriate to be called on any random, non-markup string,
- such as author names, titles, etc.
-
- It will try to clean up common unicode mangles, HTML characters, etc.
-
- This will detect XML/HTML and "do the right thing" (aka, not remove
- entities like '&amp' if there are tags in the string), unless you pass the
- 'force_xml' parameter, which might be appropriate for, eg, names and
- titles, which generally should be projected down to plain text.
-
- Also strips extra whitespace.
- """
- if not thing:
- return None
- fix_entities = 'auto'
- if force_xml:
- fix_entities = True
- fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
- if not fixed or len(fixed) <= 1:
- # wasn't zero-length before, but is now; return None
- return None
- return fixed
-
-def test_clean():
-
- assert clean(None) == None
- assert clean('') == None
- assert clean('1') == None
- assert clean('123') == '123'
- assert clean('a&amp;b') == 'a&b'
- assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
- assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
-
-def b32_hex(s):
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-def is_cjk(s):
- if not s:
- return False
- for c in s:
- if c.isalpha():
- lang_prefix = unicodedata.name(c).split()[0]
- return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
- return False
-
-def test_is_cjk():
- assert is_cjk(None) is False
- assert is_cjk('') is False
- assert is_cjk('blah') is False
- assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
- assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
- assert is_cjk('菊') is True
- assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
- assert is_cjk('水道') is True
- assert is_cjk('オウ, イク') is True # kanji
- assert is_cjk('ひヒ') is True
- assert is_cjk('き゚ゅ') is True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
-
DOMAIN_REL_MAP = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -444,6 +292,7 @@ class EntityImporter:
raise NotImplementedError
def is_orcid(self, orcid):
+ # TODO: replace with clean_orcid() from fatcat_tools.normal
return self._orcid_regex.match(orcid) is not None
def lookup_orcid(self, orcid):
@@ -464,6 +313,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi):
+ # TODO: replace with clean_doi() from fatcat_tools.normal
return doi.startswith("10.") and doi.count("/") >= 1
def lookup_doi(self, doi):
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 71f08952..e77fa65e 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -487,8 +487,6 @@ class CrossrefImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
- # doesn't exist, need to update
- return True
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 86740e80..70f8db86 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -151,7 +151,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
'Unknown',
)))
-# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
+# UNKNOWN_MARKERS_LOWER is the lowercase version of the UNKNOWN blocklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
@@ -346,7 +346,7 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
return False
- # check for blacklisted "spam", e.g. "FULL MOVIE"
+ # check for blocklisted "spam", e.g. "FULL MOVIE"
for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
seen = set()
for token in rule.get("tokens", []):
@@ -781,8 +781,6 @@ class DataciteImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
- # doesn't exist, need to update
- return True
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
@@ -819,7 +817,7 @@ class DataciteImporter(EntityImporter):
contribs = []
# Names, that should be ignored right away.
- name_blacklist = set(('Occdownload Gbif.Org',))
+ name_blocklist = set(('Occdownload Gbif.Org',))
i = 0
for c in creators:
@@ -861,7 +859,7 @@ class DataciteImporter(EntityImporter):
continue
if not name:
name = "{} {}".format(given_name or '', surname or '').strip()
- if name in name_blacklist:
+ if name in name_blocklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
continue
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
new file mode 100644
index 00000000..03752484
--- /dev/null
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -0,0 +1,358 @@
+"""
+Importer for DOAJ article-level metadata, schema v1.
+
+DOAJ API schema and docs: https://doaj.org/api/v1/docs
+"""
+
+import warnings
+import datetime
+from typing import List, Optional
+
+import fatcat_openapi_client
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+ clean_orcid, detect_text_lang, parse_lang_name, parse_country_name,
+ clean_pmid, clean_pmcid)
+from fatcat_tools.importers.common import EntityImporter
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
+
+class DoajArticleImporter(EntityImporter):
+
+ def __init__(self,
+ api,
+ issn_map_file,
+ **kwargs):
+
+ eg_desc = kwargs.get(
+ 'editgroup_description',
+ "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps"
+ )
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent',
+ 'fatcat_tools.DoajArticleImporter')
+ # ensure default is to not do updates with this worker (override super() default)
+ kwargs['do_updates'] = kwargs.get("do_updates", False)
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ self.this_year = datetime.datetime.now().year
+ self.read_issn_map_file(issn_map_file)
+
+ def want(self, obj):
+ return True
+
+ def parse_record(self, obj):
+ """
+ bibjson {
+ abstract (string, optional),
+ author (Array[bibjson.author], optional),
+ identifier (Array[bibjson.identifier]),
+ journal (bibjson.journal, optional),
+ keywords (Array[string], optional),
+ link (Array[bibjson.link], optional),
+ month (string, optional),
+ subject (Array[bibjson.subject], optional),
+ title (string),
+ year (string, optional)
+ }
+ bibjson.journal {
+ country (string, optional),
+ end_page (string, optional),
+ language (Array[string], optional),
+ license (Array[bibjson.journal.license], optional),
+ number (string, optional),
+ publisher (string, optional),
+ start_page (string, optional),
+ title (string, optional),
+ volume (string, optional)
+ }
+ """
+
+ if not obj or not isinstance(obj, dict) or not 'bibjson' in obj:
+ self.counts['skip-empty'] += 1
+ return None
+
+ bibjson = obj['bibjson']
+
+ title = clean_str(bibjson.get('title'), force_xml=True)
+ if not title:
+ self.counts['skip-title'] += 1
+ return False
+
+ container_name = clean_str(bibjson['journal']['title'])
+ container_id = None
+ # NOTE: 'issns' not documented in API schema
+ for issn in bibjson['journal']['issns']:
+ issnl = self.issn2issnl(issn)
+ if issnl:
+ container_id = self.lookup_issnl(self.issn2issnl(issn))
+ if container_id:
+ # don't store container_name when we have an exact match
+ container_name = None
+ break
+
+ volume = clean_str(bibjson['journal'].get('volume'))
+ # NOTE: this schema seems to use "number" as "issue number"
+ issue = clean_str(bibjson['journal'].get('number'))
+ publisher = clean_str(bibjson['journal'].get('publisher'))
+
+ try:
+ release_year = int(bibjson.get('year'))
+ except (TypeError, ValueError):
+ release_year = None
+ release_month = parse_month(clean_str(bibjson.get('month')))
+
+ # block bogus far-future years/dates
+ if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ release_month = None
+ release_year = None
+
+ license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
+ country = parse_country_name(bibjson['journal'].get('country'))
+ language = None
+ for raw in bibjson['journal'].get('language') or []:
+ language = parse_lang_name(raw)
+ if language:
+ break
+
+ # pages
+ # NOTE: error in API docs? seems like start_page not under 'journal' object
+ start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
+ end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
+ pages: Optional[str] = None
+ if start_page and end_page:
+ pages = f"{start_page}-{end_page}"
+ elif start_page:
+ pages = start_page
+
+ doaj_article_id = obj['id'].lower()
+ ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
+ abstracts = self.doaj_abstracts(bibjson)
+ contribs = self.doaj_contribs(bibjson.get('author') or [])
+
+ # DOAJ-specific extra
+ doaj_extra = dict()
+ if bibjson.get('subject'):
+ doaj_extra['subject'] = bibjson.get('subject')
+ if bibjson.get('keywords'):
+ doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
+
+ # generic extra
+ extra = dict()
+ if country:
+ extra['country'] = country
+ if not container_id and container_name:
+ extra['container_name'] = container_name
+ if release_year and release_month:
+ # TODO: schema migration
+ extra['release_month'] = release_month
+
+ if doaj_extra:
+ extra['doaj'] = doaj_extra
+ if not extra:
+ extra = None
+
+ re = fatcat_openapi_client.ReleaseEntity(
+ work_id=None,
+ container_id=container_id,
+ release_type='article-journal',
+ release_stage='published',
+ title=title,
+ release_year=release_year,
+ #release_date,
+ publisher=publisher,
+ ext_ids=ext_ids,
+ contribs=contribs,
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ language=language,
+ abstracts=abstracts,
+ extra=extra,
+ license_slug=license_slug,
+ )
+ re = self.biblio_hacks(re)
+ return re
+
+ @staticmethod
+ def biblio_hacks(re):
+ """
+ This function handles known special cases. For example,
+ publisher-specific or platform-specific workarounds.
+ """
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing release by DOAJ article id
+ existing = None
+ try:
+ existing = self.api.lookup_release(doaj=re.ext_ids.doaj)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try other ext_id lookups
+ if not existing:
+ for extid_type in ('doi', 'pmid', 'pmcid'):
+ extid_val = getattr(re.ext_ids, extid_type)
+ if not extid_val:
+ continue
+ #print(f" lookup release type: {extid_type} val: {extid_val}")
+ try:
+ existing = self.api.lookup_release(**{extid_type: extid_val})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if existing:
+ if existing.ext_ids.doaj:
+ warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}"
+ warnings.warn(warn_str)
+ self.counts["skip-doaj-id-mismatch"] += 1
+ return False
+ break
+
+ # TODO: in the future could do fuzzy match here, eg using elasticsearch
+
+ # create entity
+ if not existing:
+ return True
+
+ # other logic could go here about skipping updates
+ if not self.do_updates or existing.ext_ids.doaj:
+ self.counts['exists'] += 1
+ return False
+
+ # fields to copy over for update
+ existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj
+ existing.release_type = existing.release_type or re.release_type
+ existing.release_stage = existing.release_stage or re.release_stage
+ existing.container_id = existing.container_id or re.container_id
+ existing.abstracts = existing.abstracts or re.abstracts
+ existing.extra['doaj'] = re.extra['doaj']
+ existing.volume = existing.volume or re.volume
+ existing.issue = existing.issue or re.issue
+ existing.pages = existing.pages or re.pages
+ existing.language = existing.language or re.language
+
+ try:
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts['update'] += 1
+ except fatcat_openapi_client.rest.ApiException as err:
+ # there is a code path where we try to update the same release
+ # twice in a row; if that happens, just skip
+ # NOTE: API behavior might change in the future?
+ if "release_edit_editgroup_id_ident_id_key" in err.body:
+ self.counts['skip-update-conflict'] += 1
+ return False
+ else:
+ raise err
+
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
+ text = clean_str(bibjson.get('abstract'))
+ if not text or len(text) < 10:
+ return []
+ if len(text) > MAX_ABSTRACT_LENGTH:
+ text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
+
+ lang = detect_text_lang(text)
+
+ abstract = fatcat_openapi_client.ReleaseAbstract(
+ mimetype="text/plain",
+ content=text,
+ lang=lang,
+ )
+
+ return [abstract,]
+
+ def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
+ """
+ bibjson.author {
+ affiliation (string, optional),
+ name (string),
+ orcid_id (string, optional)
+ }
+ """
+ contribs = []
+ index = 0
+ for author in authors:
+ if not author.get('name'):
+ continue
+ creator_id = None
+ orcid = clean_orcid(author.get('orcid_id'))
+ if orcid:
+ creator_id = self.lookup_orcid(orcid)
+ contribs.append(fatcat_openapi_client.ReleaseContrib(
+ raw_name=author.get('name'),
+ role='author',
+ index=index,
+ creator_id=creator_id,
+ raw_affiliation=clean_str(author.get('affiliation')),
+ ))
+ index += 1
+ return contribs
+
+ def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
+ """
+ bibjson.identifier {
+ id (string),
+ type (string)
+ }
+ """
+
+ assert doaj_article_id.isalnum() and len(doaj_article_id) == 32
+
+ doi: Optional[str] = None
+ pmid: Optional[str] = None
+ pmcid: Optional[str] = None
+ for id_obj in identifiers:
+ if not id_obj.get('id'):
+ continue
+ if id_obj['type'].lower() == 'doi':
+ doi = clean_doi(id_obj['id'])
+ elif id_obj['type'].lower() == 'pmid':
+ pmid = clean_pmid(id_obj['id'])
+ elif id_obj['type'].lower() == 'pmcid':
+ pmcid = clean_pmcid(id_obj['id'])
+
+ return fatcat_openapi_client.ReleaseExtIds(
+ doaj=doaj_article_id,
+ doi=doi,
+ pmid=pmid,
+ pmcid=pmcid,
+ )
+
+ def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]:
+ """
+ bibjson.journal.license {
+ open_access (boolean, optional),
+ title (string, optional),
+ type (string, optional),
+ url (string, optional),
+ version (string, optional)
+ }
+ """
+ if not license_list:
+ return None
+ for license in license_list:
+ if not license.get('open_access'):
+ continue
+ slug = license.get('type')
+ if slug.startswith('CC '):
+ slug = slug.replace('CC ', 'cc-').lower()
+ return slug
+ return None
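For reference, here is a hypothetical minimal record of the shape this importer consumes, with field names taken from the docstrings and code above; the 32-character id, DOI, ISSN, and other values are invented for illustration:

    # Hypothetical DOAJ article record (one JSON line, parsed into a dict).
    # All identifier values below are made up.
    example_doaj_record = {
        "id": "e58f08a11ecb495ead55a44ad4f89808",  # 32-char alphanumeric DOAJ article id
        "bibjson": {
            "title": "An Example Article Title",
            "year": "2020",
            "month": "11",
            "identifier": [{"type": "doi", "id": "10.1234/example-doi"}],
            "journal": {
                "title": "Example Journal of Examples",
                "issns": ["1234-5679"],
                "volume": "10",
                "number": "2",
                "publisher": "Example Press",
                "language": ["English"],
                "country": "United States",
                "license": [{"open_access": True, "type": "CC BY"}],
            },
            "author": [{"name": "Jane Doe", "orcid_id": "0000-0002-1825-0097"}],
            "abstract": "An example abstract that is long enough to be kept by doaj_abstracts().",
        },
    }

parse_record() would map such a record to a ReleaseEntity with release_type 'article-journal', ext_ids.doaj set to the lowercased id, license_slug 'cc-by', language 'en', the country code stored under extra['country'], and a single author contrib.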
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 4b1d3702..1e04e712 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,4 +1,6 @@
+import datetime
+
import fatcat_openapi_client
from .common import EntityImporter, make_rel_url
@@ -20,10 +22,10 @@ class IngestFileResultImporter(EntityImporter):
assert self.default_link_rel
self.require_grobid = require_grobid
if self.require_grobid:
- print("Requiring GROBID status == 200")
+ print("Requiring GROBID status == 200 (for PDFs)")
else:
print("NOT checking GROBID success")
- self.ingest_request_source_whitelist = [
+ self.ingest_request_source_allowlist = [
'fatcat-changelog',
'fatcat-ingest-container',
'fatcat-ingest',
@@ -35,23 +37,41 @@ class IngestFileResultImporter(EntityImporter):
's2-corpus',
's2',
]
- if kwargs.get('skip_source_whitelist', False):
- self.ingest_request_source_whitelist = []
+ if kwargs.get('skip_source_allowlist', False):
+ self.ingest_request_source_allowlist = []
- def want(self, row):
+ def want_file(self, row) -> bool:
+ """
+ File-specific part of want(). Generic across general ingest and save-paper-now.
"""
- Logic here probably needs work (TODO):
- - Direct ingests via DOI from fatcat-changelog should probably go
- through regardless of GROBID status
- - We should filter/block things like single-page PDFs here
- - public/anonymous submissions could require successful biblio-glutton
- match, or some other sanity check on the fatcat side (eg, fuzzy title
- match)
- - handle the case of release_stage not being 'published'; if pre-print,
- potentially create a new release.
+ if not row.get('file_meta'):
+ self.counts['skip-file-meta'] += 1
+ return False
- The current logic is intentionally conservative as a first step.
+ # type-specific filters
+ if row['request'].get('ingest_type') == 'pdf':
+ if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+ self.counts['skip-grobid'] += 1
+ return False
+ if row['file_meta'].get('mimetype') not in ("application/pdf",):
+ self.counts['skip-mimetype'] += 1
+ return False
+ elif row['request'].get('ingest_type') == 'xml':
+ if row['file_meta'].get('mimetype') not in ("application/xml",
+ "application/jats+xml", "application/tei+xml", "text/xml"):
+ self.counts['skip-mimetype'] += 1
+ return False
+ else:
+ self.counts['skip-ingest-type'] += 1
+ return False
+
+ return True
+
+ def want_ingest(self, row) -> bool:
+ """
+ Sandcrawler ingest-specific part of want(). Generic across file and
+ webcapture ingest.
"""
if row.get('hit') != True:
self.counts['skip-hit'] += 1
@@ -60,33 +80,48 @@ class IngestFileResultImporter(EntityImporter):
if not source:
self.counts['skip-ingest_request_source'] += 1
return False
- if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
+ if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist:
self.counts['skip-ingest_request_source'] += 1
return False
- if source.startswith('arabesque'):
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
- self.counts['skip-arabesque-source'] += 1
- return False
+
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
+ self.counts['skip-link-source'] += 1
+ return False
+
if source.startswith('savepapernow'):
# never process async savepapernow requests
self.counts['skip-savepapernow'] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+
+ return True
+
+ def want(self, row):
+ """
+ Overall logic here probably needs work (TODO):
+
+ - Direct ingests via DOI from fatcat-changelog should probably go
+ through regardless of GROBID status
+ - We should filter/block things like single-page PDFs here
+ - public/anonymous submissions could require successful biblio-glutton
+ match, or some other sanity check on the fatcat side (eg, fuzzy title
+ match)
+ - handle the case of release_stage not being 'published'; if pre-print,
+ potentially create a new release.
+
+ The current logic is intentionally conservative as a first step.
+ """
+ if not self.want_file(row):
return False
- if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
- self.counts['skip-grobid'] += 1
+ if not self.want_ingest(row):
return False
return True
- def parse_record(self, row):
+ def parse_ingest_release_ident(self, row):
request = row['request']
fatcat = request.get('fatcat')
- file_meta = row['file_meta']
- # identify release by fatcat ident, or extid lookup, or biblio-glutton match
release_ident = None
if fatcat and fatcat.get('release_ident'):
release_ident = fatcat.get('release_ident')
@@ -112,23 +147,21 @@ class IngestFileResultImporter(EntityImporter):
return None
release_ident = release.ident
break
+
if self.use_glutton_match and not release_ident and row.get('grobid'):
# try biblio-glutton extracted hit
if row['grobid'].get('fatcat_release'):
release_ident = row['grobid']['fatcat_release'].split('_')[-1]
self.counts['glutton-match'] += 1
- if not release_ident:
- self.counts['skip-release-not-found'] += 1
- return None
+ return release_ident
+ def parse_terminal(self, row):
terminal = row.get('terminal')
if not terminal:
# support old cdx-only ingest results
cdx = row.get('cdx')
if not cdx:
- # TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
return None
else:
terminal = {
@@ -142,7 +175,15 @@ class IngestFileResultImporter(EntityImporter):
terminal['terminal_url'] = terminal['url']
if not 'terminal_dt' in terminal:
terminal['terminal_dt'] = terminal['dt']
+
+ # convert CDX-style digits to ISO-style timestamp
assert len(terminal['terminal_dt']) == 14
+ terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z"
+ return terminal
+
+ def parse_urls(self, row, terminal):
+
+ request = row['request']
default_rel = self.default_link_rel
if request.get('link_source') == 'doi':
@@ -159,6 +200,55 @@ class IngestFileResultImporter(EntityImporter):
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+ return urls
+
+ def parse_edit_extra(self, row):
+
+ request = row['request']
+ edit_extra = dict()
+
+ if request.get('edit_extra'):
+ edit_extra = request['edit_extra']
+
+ if request.get('ingest_request_source'):
+ edit_extra['ingest_request_source'] = request['ingest_request_source']
+ if request.get('link_source') and request.get('link_source_id'):
+ edit_extra['link_source'] = request['link_source']
+ edit_extra['link_source_id'] = request['link_source_id']
+
+ return edit_extra
+
+ def parse_record(self, row):
+
+ request = row['request']
+ file_meta = row['file_meta']
+
+ # double check that want() filtered request correctly (eg, old requests)
+ if request.get('ingest_type') not in ('pdf', 'xml'):
+ self.counts['skip-ingest-type'] += 1
+ return None
+ assert (request['ingest_type'], file_meta['mimetype']) in [
+ ("pdf", "application/pdf"),
+ ("xml", "application/xml"),
+ ("xml", "application/jats+xml"),
+ ("xml", "application/tei+xml"),
+ ("xml", "text/xml"),
+ ]
+
+ # identify release by fatcat ident, or extid lookup, or biblio-glutton match
+ release_ident = self.parse_ingest_release_ident(row)
+
+ if not release_ident:
+ self.counts['skip-release-not-found'] += 1
+ return None
+
+ terminal = self.parse_terminal(row)
+ if not terminal:
+ # TODO: support archive.org hits?
+ self.counts['skip-no-terminal'] += 1
+ return None
+
+ urls = self.parse_urls(row, terminal)
fe = fatcat_openapi_client.FileEntity(
md5=file_meta['md5hex'],
@@ -169,17 +259,10 @@ class IngestFileResultImporter(EntityImporter):
release_ids=[release_ident],
urls=urls,
)
- if request.get('edit_extra'):
- fe.edit_extra = request['edit_extra']
- else:
- fe.edit_extra = dict()
- if request.get('ingest_request_source'):
- fe.edit_extra['ingest_request_source'] = request['ingest_request_source']
- if request.get('link_source') and request.get('link_source_id'):
- fe.edit_extra['link_source'] = request['link_source']
- fe.edit_extra['link_source_id'] = request['link_source_id']
- if not fe.edit_extra:
- fe.edit_extra = None
+
+ edit_extra = self.parse_edit_extra(row)
+ if edit_extra:
+ fe.edit_extra = edit_extra
return fe
def try_update(self, fe):
@@ -244,6 +327,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def want(self, row):
+ if not self.want_file(row):
+ return False
+
source = row['request'].get('ingest_request_source')
if not source:
self.counts['skip-ingest_request_source'] += 1
@@ -254,12 +340,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
if row.get('hit') != True:
self.counts['skip-hit'] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
- return False
- if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
- self.counts['skip-grobid'] += 1
- return False
return True
@@ -280,3 +360,154 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
+
+
+class IngestWebResultImporter(IngestFileResultImporter):
+ """
+ Variant of IngestFileResultImporter for processing HTML ingest requests
+ into webcapture objects.
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter')
+ kwargs['do_updates'] = False
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ def want(self, row):
+
+ if not self.want_ingest(row):
+ return False
+
+ if not row.get('file_meta'):
+ self.counts['skip-file-meta'] += 1
+ return False
+
+ # webcapture-specific filters
+ if row['request'].get('ingest_type') != 'html':
+ self.counts['skip-ingest-type'] += 1
+ return False
+ if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
+ self.counts['skip-mimetype'] += 1
+ return False
+
+ return True
+
+ def parse_record(self, row):
+
+ request = row['request']
+ file_meta = row['file_meta']
+
+ # double check that want() filtered request correctly (eg, old requests)
+ if request.get('ingest_type') != "html":
+ self.counts['skip-ingest-type'] += 1
+ return None
+ if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
+ self.counts['skip-mimetype'] += 1
+ return None
+
+ # identify release by fatcat ident, or extid lookup
+ release_ident = self.parse_ingest_release_ident(row)
+
+ if not release_ident:
+ self.counts['skip-release-not-found'] += 1
+ return None
+
+ terminal = self.parse_terminal(row)
+ if not terminal:
+ # TODO: support archive.org hits?
+ self.counts['skip-no-terminal'] += 1
+ return None
+
+ urls = self.parse_urls(row, terminal)
+ archive_urls = [u for u in urls if u.rel == 'webarchive']
+
+ if terminal['terminal_status_code'] != 200:
+ self.counts['skip-terminal-status-code'] += 1
+ return None
+
+ terminal_cdx = row['cdx']
+ if 'revisit_cdx' in row:
+ terminal_cdx = row['revisit_cdx']
+ assert terminal_cdx['surt']
+ assert terminal_cdx['url'] == terminal['terminal_url']
+
+ wc_cdx = []
+ # primary resource first
+ wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
+ surt=terminal_cdx['surt'],
+ timestamp=terminal['terminal_timestamp'],
+ url=terminal['terminal_url'],
+ mimetype=file_meta['mimetype'],
+ status_code=terminal['terminal_status_code'],
+ sha1=file_meta['sha1hex'],
+ sha256=file_meta['sha256hex'],
+ size=file_meta['size_bytes'],
+ ))
+
+ for resource in row.get('html_resources', []):
+ timestamp = resource['timestamp']
+ if not "+" in timestamp and not "Z" in timestamp:
+ timestamp += "Z"
+ wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
+ surt=resource['surt'],
+ timestamp=timestamp,
+ url=resource['url'],
+ mimetype=resource.get('mimetype'),
+ size=resource.get('size'),
+ sha1=resource.get('sha1hex'),
+ sha256=resource.get('sha256hex'),
+ ))
+
+ wc = fatcat_openapi_client.WebcaptureEntity(
+ cdx=wc_cdx,
+ archive_urls=archive_urls,
+ original_url=terminal['terminal_url'],
+ timestamp=terminal['terminal_timestamp'],
+ release_ids=[release_ident],
+ )
+
+ edit_extra = self.parse_edit_extra(row)
+
+ if edit_extra:
+ wc.edit_extra = edit_extra
+ return wc
+
+ def try_update(self, wc):
+
+ # check for existing edits-in-progress with same URL
+ for other in self._entity_queue:
+ if other.original_url == wc.original_url:
+ self.counts['skip-in-queue'] += 1
+ return False
+
+ # lookup sha1, or create new entity (TODO: API doesn't support this yet)
+ #existing = None
+
+ # TODO: currently only allow one release per webcapture
+ release = self.api.get_release(wc.release_ids[0], expand="webcaptures")
+ if release.webcaptures:
+ # check if this is an existing match, or just a similar hit
+ for other in release.webcaptures:
+ if wc.original_url == other.original_url:
+ # TODO: compare very similar timestamps of same time (different formats)
+ self.counts['exists'] += 1
+ return False
+ self.counts['skip-release-has-webcapture'] += 1
+ return False
+
+ # Ok, if we got here then no existing web capture for (first) release,
+ # so go ahead and insert!
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
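As a side note on parse_terminal() above: the 14-digit CDX-style datetime is converted into an ISO 8601 timestamp with a trailing "Z". A standalone sketch of that conversion, using a made-up timestamp value:

    import datetime

    # same conversion as parse_terminal(): "20201113120000" -> "2020-11-13T12:00:00Z"
    terminal_dt = "20201113120000"  # example CDX datetime, not from a real ingest result
    assert len(terminal_dt) == 14
    terminal_timestamp = datetime.datetime.strptime(terminal_dt, "%Y%m%d%H%M%S").isoformat() + "Z"
    print(terminal_timestamp)  # 2020-11-13T12:00:00Z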
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 10a90dba..9ee641fa 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -5,6 +5,13 @@ free-form input, titles, etc.
"""
import re
+import base64
+from typing import Optional
+import unicodedata
+
+import ftfy
+import langdetect
+import pycountry
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
@@ -55,7 +62,8 @@ def clean_doi(raw):
# will likely want to expand DOI_REGEX to exclude non-ASCII characters, but
# for now block specific characters so we can get PubMed importer running
# again.
- if 'ä' in raw or '\u200e' in raw:
+ # known characters to skip: ä á \u200e \u2043 \u2012
+ if not raw.isascii():
return None
return raw
@@ -72,6 +80,10 @@ def test_clean_doi():
assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
assert clean_doi("10.6002/ect.2020.häyry") == None # this example via pubmed (pmid:32519616)
assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
+ assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None
+ assert clean_doi("10.4025/diálogos.v17i2.36030") == None
+ assert clean_doi("10.19027/jai.10.106‒115") == None
+ assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
@@ -233,3 +245,322 @@ def test_clean_orcid():
assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789"
assert clean_orcid("01234567-3456-6780") == None
assert clean_orcid("0x23-4567-3456-6780") == None
+
+
+def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
+
+ It will try to clean up common unicode mangles, HTML characters, etc.
+
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+ entities like '&amp' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return None
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+ if not fixed or len(fixed) <= 1:
+ # wasn't zero-length before, but is now; return None
+ return None
+ return fixed
+
+def test_clean_str():
+
+ assert clean_str(None) == None
+ assert clean_str('') == None
+ assert clean_str('1') == None
+ assert clean_str('123') == '123'
+ assert clean_str('a&amp;b') == 'a&b'
+ assert clean_str('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean_str('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+def b32_hex(s):
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ return s
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+def is_cjk(s):
+ if not s:
+ return False
+ for c in s:
+ if c.isalpha():
+ lang_prefix = unicodedata.name(c).split()[0]
+ return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+ return False
+
+def test_is_cjk():
+ assert is_cjk(None) is False
+ assert is_cjk('') is False
+ assert is_cjk('blah') is False
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
+ assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
+ assert is_cjk('菊') is True
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
+ assert is_cjk('水道') is True
+ assert is_cjk('オウ, イク') is True # katakana
+ assert is_cjk('ひヒ') is True
+ assert is_cjk('き゚ゅ') is True
+ assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+
+MONTH_MAP = {
+ "jan": 1, "january": 1,
+ "feb": 2, "february": 2,
+ "mar": 3, "march": 3,
+ "apr": 4, "april": 4,
+ "may": 5,
+ "jun": 6, "june": 6,
+ "jul": 7, "july": 7,
+ "aug": 8, "august": 8,
+ "sep": 9, "september": 9,
+ "oct": 10, "october": 10,
+ "nov": 11, "november": 11,
+ "dec": 12, "december": 12,
+}
+
+def parse_month(raw: Optional[str]) -> Optional[int]:
+ """
+ Parses a string into a month number (1 to 12)
+ """
+ if not raw:
+ return None
+ raw = raw.strip().lower()
+ if raw.isdigit():
+ raw_int = int(raw)
+ if raw_int >= 1 and raw_int <= 12:
+ return raw_int
+ else:
+ return None
+ if raw in MONTH_MAP:
+ return MONTH_MAP[raw]
+ return None
+
+def test_parse_month() -> None:
+
+ assert parse_month(None) == None
+ assert parse_month("") == None
+ assert parse_month("0") == None
+ assert parse_month("10") == 10
+ assert parse_month("jan") == 1
+ assert parse_month("September") == 9
+
+def detect_text_lang(raw: str) -> Optional[str]:
+ """
+ Tries to determine language of, eg, an abstract.
+
+ Returns an ISO 639-1 2-char language code, or None.
+ """
+ if not raw:
+ return None
+ try:
+ lang = langdetect.detect(raw)
+ lang = lang.split('-')[0]
+ assert len(lang) == 2
+ return lang
+ except (langdetect.lang_detect_exception.LangDetectException, TypeError):
+ return None
+ return None
+
+def test_detect_text_lang() -> None:
+ assert detect_text_lang("") == None
+ EN_SAMPLE = "this is a string of English text for testing"
+ assert detect_text_lang(EN_SAMPLE) == "en"
+ JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。"
+ assert detect_text_lang(JA_SAMPLE) == "ja"
+ ZH_SAMPLE = "随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。"
+ # XXX: why does this detect as `ko` sometimes?
+ assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko")
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+ """
+ Parses a language name and returns a 2-char ISO 639-1 language code.
+ """
+ if not raw:
+ return None
+ try:
+ lang = pycountry.languages.lookup(raw)
+ if lang.alpha_3 in ("mul", "mis"):
+ return None
+ return lang.alpha_2.lower()
+ except LookupError:
+ #print(f" unknown language: '{raw}', file=sys.stderr)
+ return None
+ except AttributeError:
+ #print(f" partial language metadata: '{lang}', file=sys.stderr)
+ return None
+ return None
+
+def test_parse_lang_name() -> None:
+
+ assert parse_lang_name(None) == None
+ assert parse_lang_name("") == None
+ assert parse_lang_name("asdf ") == None
+ assert parse_lang_name("english") == "en"
+ assert parse_lang_name("ENGLISH") == "en"
+ assert parse_lang_name("asdf blah") is None
+ assert parse_lang_name("en") == "en"
+ assert parse_lang_name("EN") == "en"
+ assert parse_lang_name("ENG") == "en"
+ assert parse_lang_name("English") == "en"
+ assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+ """
+ Parses a country name into a ISO country code (2-char).
+
+ This version copied from the chocula repository.
+ """
+ if not s or s in ("Unknown"):
+ return None
+
+ s = s.strip()
+ if s.lower() in ("usa", "new york (state)", "washington (state)"):
+ return "us"
+ if s.lower() in ("russia (federation)", "russia"):
+ return "ru"
+ if s == "Québec (Province)":
+ s = "Canada"
+ if s == "China (Republic : 1949- )":
+ return "tw"
+ if s == "Brunei":
+ return "bn"
+ if s.startswith("Congo "):
+ s = "Congo"
+ if s.lower() == "iran":
+ return "ir"
+ if s.lower() == "bermuda islands":
+ return "bm"
+ if s.lower() == "burma":
+ s = "myanmar"
+ if s.lower() in ("korea (south)", "south korea"):
+ return "kr"
+ if s.lower() in ("england", "scotland", "wales"):
+ return "uk"
+ s = s.replace(" (Republic)", "").replace(" (Federation)", "")
+
+ try:
+ country = pycountry.countries.lookup(s)
+ except LookupError:
+ country = None
+
+ if country:
+ return country.alpha_2.lower()
+ try:
+ sub = pycountry.subdivisions.lookup(s)
+ except LookupError:
+ sub = None
+
+ s = s.replace(" (State)", "").replace(" (Province)", "")
+ if sub:
+ return sub.country_code.lower()
+
+ else:
+ # print(f"unknown country: {s}", file=sys.stderr)
+ return None
+
+
+def test_parse_country_name():
+ assert parse_country_name("") is None
+ assert parse_country_name("asdf blah") is None
+ assert parse_country_name("us") == "us"
+ assert parse_country_name("USA") == "us"
+ assert parse_country_name("United States of America") == "us"
+ assert parse_country_name("united States") == "us"
+ assert parse_country_name("Massachusetts") == "us"
+ assert parse_country_name("Russia") == "ru"
+ assert parse_country_name("Japan") == "jp"
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC = {
+ 'afr': 'af',
+ 'alb': 'sq',
+ 'amh': 'am',
+ 'ara': 'ar',
+ 'arm': 'hy',
+ 'aze': 'az',
+ 'ben': 'bn',
+ 'bos': 'bs',
+ 'bul': 'bg',
+ 'cat': 'ca',
+ 'chi': 'zh',
+ 'cze': 'cs',
+ 'dan': 'da',
+ 'dut': 'nl',
+ 'eng': 'en',
+ 'epo': 'eo',
+ 'est': 'et',
+ 'fin': 'fi',
+ 'fre': 'fr',
+ 'geo': 'ka',
+ 'ger': 'de',
+ 'gla': 'gd',
+ 'gre': 'el',
+ 'heb': 'he',
+ 'hin': 'hi',
+ 'hrv': 'hr',
+ 'hun': 'hu',
+ 'ice': 'is',
+ 'ind': 'id',
+ 'ita': 'it',
+ 'jpn': 'ja',
+ 'kin': 'rw',
+ 'kor': 'ko',
+ 'lat': 'la',
+ 'lav': 'lv',
+ 'lit': 'lt',
+ 'mac': 'mk',
+ 'mal': 'ml',
+ 'mao': 'mi',
+ 'may': 'ms',
+ 'nor': 'no',
+ 'per': 'fa',
+ 'pol': 'pl',
+ 'por': 'pt',
+ 'pus': 'ps',
+ 'rum': 'ro',
+ 'rus': 'ru',
+ 'san': 'sa',
+ 'slo': 'sk',
+ 'slv': 'sl',
+ 'spa': 'es',
+ 'srp': 'sr',
+ 'swe': 'sv',
+ 'tha': 'th',
+ 'tur': 'tr',
+ 'ukr': 'uk',
+ 'urd': 'ur',
+ 'vie': 'vi',
+ 'wel': 'cy',
+
+# additions
+ 'gle': 'ga', # "Irish" (Gaelic)
+ 'jav': 'jv', # Javanese
+ 'welsh': 'cy', # Welsh
+ 'oci': 'oc', # Occitan
+
+# Don't have ISO 639-1 codes
+ 'grc': 'el', # Ancient Greek; map to modern greek
+ 'map': None, # Austronesian (collection)
+ 'syr': None, # Syriac, Modern
+ 'gem': None, # Old Saxon
+ 'non': None, # Old Norse
+ 'emg': None, # Eastern Meohang
+ 'neg': None, # Negidal
+ 'mul': None, # Multiple languages
+ 'und': None, # Undetermined
+}
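Taken together, the helpers consolidated into fatcat_tools.normal can be exercised roughly as follows (expected values mirror the tests added above):

    from fatcat_tools.normal import (
        clean_doi, clean_str, parse_month, parse_lang_name,
        parse_country_name, detect_text_lang,
    )

    # DOIs containing any non-ASCII characters are now rejected outright
    assert clean_doi("10.4025/diálogos.v17i2.36030") is None

    # HTML entities and mojibake get fixed; empty or one-char strings become None
    assert clean_str("a&amp;b") == "a&b"
    assert clean_str("") is None

    # free-form month, language, and country values normalize to simple codes
    assert parse_month("September") == 9
    assert parse_lang_name("Portuguese") == "pt"
    assert parse_country_name("United States of America") == "us"

    # abstract language detection returns a 2-char code, or None on failure
    assert detect_text_lang("this is a string of English text for testing") == "en"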
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index dfb5f799..ad4b7722 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,10 +1,13 @@
import datetime
+from typing import Optional
import tldextract
+from fatcat_openapi_client import ReleaseEntity, ContainerEntity
-def check_kbart(year, archive):
+
+def check_kbart(year: int, archive: dict) -> Optional[bool]:
if not archive or not archive.get('year_spans'):
return None
for span in archive['year_spans']:
@@ -12,7 +15,7 @@ def check_kbart(year, archive):
return True
return False
-def test_check_kbart():
+def test_check_kbart() -> None:
assert check_kbart(1990, dict()) is None
assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False
@@ -21,10 +24,13 @@ def test_check_kbart():
assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True
-def release_to_elasticsearch(entity, force_bool=True):
+def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:
"""
Converts from an entity model/schema to elasticsearch oriented schema.
+ This is a large/complex transform, so subsets are split out into helper
+ functions.
+
Returns: dict
Raises exception on error (never returns None)
"""
@@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True):
mag_id = release.ext_ids.mag,
)
- is_oa = None
- is_preserved = None
- is_longtail_oa = None
- in_kbart = None
- in_jstor = False
- in_web = False
- in_dweb = False
- in_ia = False
- in_ia_sim = False
- in_shadows = False
+ t.update(dict(
+ is_oa = None,
+ is_longtail_oa = None,
+ is_preserved = None,
+ in_web = False,
+ in_dweb = False,
+ in_ia = False,
+ in_ia_sim = False,
+ in_kbart = None,
+ in_jstor = False,
+ in_shadows = False,
+ ))
release_year = release.release_year
if release.release_date:
@@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True):
# TODO: mapping... probably by lookup?
t['affiliation_rors'] = None
- this_year = datetime.date.today().year
- container = release.container
- if container:
- t['publisher'] = container.publisher
- t['container_name'] = container.name
- # this is container.ident, not release.container_id, because there may
- # be a redirect involved
- t['container_id'] = container.ident
- t['container_issnl'] = container.issnl
- t['container_type'] = container.container_type
- if container.extra:
- c_extra = container.extra
- if c_extra.get('kbart') and release_year:
- in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
- in_kbart = in_jstor
- for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
- 'hathitrust', 'scholarsportal', 'cariniana'):
- in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
- # recent KBART coverage is often not updated for the
- # current year. So for current-year publications, consider
- # coverage from *last* year to also be included in the
- # Keeper
- if not in_kbart and release_year == this_year:
- in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
-
- if c_extra.get('ia'):
- if c_extra['ia'].get('sim') and release_year:
- in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
- if c_extra['ia'].get('longtail_oa'):
- is_longtail_oa = True
- if c_extra.get('sherpa_romeo'):
- if c_extra['sherpa_romeo'].get('color') == 'white':
- is_oa = False
- if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
- is_oa = True
- if c_extra.get('doaj'):
- if c_extra['doaj'].get('as_of'):
- is_oa = True
- if c_extra.get('road'):
- if c_extra['road'].get('as_of'):
- is_oa = True
- if c_extra.get('szczepanski'):
- if c_extra['szczepanski'].get('as_of'):
- is_oa = True
- if c_extra.get('country'):
- t['country_code'] = c_extra['country']
- t['country_code_upper'] = c_extra['country'].upper()
- if c_extra.get('publisher_type'):
- t['publisher_type'] = c_extra['publisher_type']
+ if release.container:
+ t.update(_rte_container_helper(release.container, release_year))
# fall back to release-level container metadata if container not linked or
# missing context
@@ -174,67 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True):
t['container_name'] = release.extra.get('container_name')
if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
- in_jstor = True
+ t['in_jstor'] = True
- files = release.files or []
- t['file_count'] = len(files)
- t['fileset_count'] = len(release.filesets or [])
- t['webcapture_count'] = len(release.webcaptures or [])
- any_pdf_url = None
- good_pdf_url = None
- best_pdf_url = None
- ia_pdf_url = None
- for f in files:
- if f.extra and f.extra.get('shadows'):
- # TODO: shadow check goes here
- in_shadows = True
- is_pdf = 'pdf' in (f.mimetype or '')
- for release_url in (f.urls or []):
- if not f.mimetype and 'pdf' in release_url.url.lower():
- is_pdf = True
- if release_url.url.lower().startswith('http'):
- in_web = True
- if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
- # not sure what rel will be for this stuff
- in_dweb = True
- if is_pdf:
- any_pdf_url = release_url.url
- if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf:
- is_preserved = True
- good_pdf_url = release_url.url
- if '//www.jstor.org/' in release_url.url:
- in_jstor = True
- if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
- in_ia = True
- if is_pdf:
- best_pdf_url = release_url.url
- ia_pdf_url = release_url.url
- # here is where we bake-in priority; IA-specific
- t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
- t['ia_pdf_url'] = ia_pdf_url
+ # transform file/fileset/webcapture related fields
+ t.update(_rte_content_helper(release))
+
+ if release.ext_ids.doaj:
+ t['is_oa'] = True
if release.license_slug:
# TODO: more/better checks here, particularly strict *not* OA licenses
if release.license_slug.startswith("CC-"):
- is_oa = True
+ t['is_oa'] = True
if release.license_slug.startswith("ARXIV-"):
- is_oa = True
+ t['is_oa'] = True
extra = release.extra or dict()
if extra:
if extra.get('is_oa'):
# NOTE: not actually setting this anywhere... but could
- is_oa = True
+ t['is_oa'] = True
if extra.get('longtail_oa'):
# sometimes set by GROBID/matcher
- is_oa = True
- is_longtail_oa = True
+ t['is_oa'] = True
+ t['is_longtail_oa'] = True
if not t.get('container_name'):
t['container_name'] = extra.get('container_name')
if extra.get('crossref'):
if extra['crossref'].get('archive'):
# all crossref archives are KBART, I believe
- in_kbart = True
+ t['in_kbart'] = True
# backwards compatible subtitle fetching
if not t['subtitle'] and extra.get('subtitle'):
if type(extra['subtitle']) == list:
@@ -251,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True):
# TODO: non-numerical first pages
t['ia_microfilm_url'] = None
- if in_ia_sim:
+ if t['in_ia_sim']:
# TODO: determine URL somehow? I think this is in flux. Will probably
# need extra metadata in the container extra field.
# special case as a demo for now.
@@ -277,42 +207,168 @@ def release_to_elasticsearch(entity, force_bool=True):
if t['doi']:
t['doi_prefix'] = t['doi'].split('/')[0]
- if is_longtail_oa:
- is_oa = True
+ if t['is_longtail_oa']:
+ t['is_oa'] = True
+ # optionally coerce all flags from Optional[bool] to bool
if force_bool:
- t['is_oa'] = bool(is_oa)
- t['is_longtail_oa'] = bool(is_longtail_oa)
- t['in_kbart'] = bool(in_kbart)
- t['in_ia_sim'] = bool(in_ia_sim)
- t['in_jstor'] = bool(in_jstor)
- t['in_web'] = bool(in_web)
- t['in_dweb'] = bool(in_dweb)
- t['in_shadows'] = bool(in_shadows)
- else:
- t['is_oa'] = is_oa
- t['is_longtail_oa'] = is_longtail_oa
- t['in_kbart'] = in_kbart
- t['in_ia_sim'] = in_ia_sim
- t['in_jstor'] = in_jstor
- t['in_web'] = in_web
- t['in_dweb'] = in_dweb
- t['in_shadows'] = in_shadows
-
- t['in_ia'] = bool(in_ia)
- t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'))
+ for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim',
+ 'in_jstor', 'in_web', 'in_dweb', 'in_shadows'):
+ t[k] = bool(t[k])
+
+ t['in_ia'] = bool(t['in_ia'])
+ t['is_preserved'] = (
+ bool(t['is_preserved'])
+ or t['in_ia']
+ or t['in_kbart']
+ or t['in_jstor']
+ or t.get('pmcid')
+ or t.get('arxiv_id')
+ )
- if in_ia:
+ if t['in_ia']:
t['preservation'] = 'bright'
- elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'):
+ elif t['is_preserved']:
t['preservation'] = 'dark'
- elif in_shadows:
+ elif t['in_shadows']:
t['preservation'] = 'shadows_only'
else:
t['preservation'] = 'none'
return t
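
For orientation, a minimal sketch of how the refactored transform is invoked; the API client and release ident here are hypothetical, and force_bool=False would leave unknown flags as None rather than coercing them to False:

    from fatcat_tools.transforms import release_to_elasticsearch

    # hypothetical: fetch a release with the same expansions the changelog worker uses
    release = api.get_release("aaaaaaaaaaaaarceaaaaaaaaam",
                              expand="files,filesets,webcaptures,container")
    doc = release_to_elasticsearch(release, force_bool=True)
    assert doc['preservation'] in ('bright', 'dark', 'shadows_only', 'none')
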
+def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:
+ """
+ Container metadata sub-section of release_to_elasticsearch()
+ """
+ this_year = datetime.date.today().year
+ t = dict()
+ t['publisher'] = container.publisher
+ t['container_name'] = container.name
+ # this is container.ident, not release.container_id, because there may
+ # be a redirect involved
+ t['container_id'] = container.ident
+ t['container_issnl'] = container.issnl
+ t['container_type'] = container.container_type
+ if container.extra:
+ c_extra = container.extra
+ if c_extra.get('kbart') and release_year:
+ if check_kbart(release_year, c_extra['kbart'].get('jstor')):
+ t['in_jstor'] = True
+ if t.get('in_kbart') or t.get('in_jstor'):
+ t['in_kbart'] = True
+ for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
+ 'hathitrust', 'scholarsportal', 'cariniana'):
+ t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))
+ # recent KBART coverage is often not updated for the
+ # current year. So for current-year publications, consider
+ # coverage from *last* year to also be included in the
+ # Keeper
+ if not t.get('in_kbart') and release_year == this_year:
+ t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
+
+ if c_extra.get('ia'):
+ if c_extra['ia'].get('sim') and release_year:
+ t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim'])
+ if c_extra['ia'].get('longtail_oa'):
+ t['is_longtail_oa'] = True
+ if c_extra.get('sherpa_romeo'):
+ if c_extra['sherpa_romeo'].get('color') == 'white':
+ t['is_oa'] = False
+ if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+ t['is_oa'] = True
+ if c_extra.get('doaj'):
+ if c_extra['doaj'].get('as_of'):
+ t['is_oa'] = True
+ if c_extra.get('road'):
+ if c_extra['road'].get('as_of'):
+ t['is_oa'] = True
+ if c_extra.get('szczepanski'):
+ if c_extra['szczepanski'].get('as_of'):
+ t['is_oa'] = True
+ if c_extra.get('country'):
+ t['country_code'] = c_extra['country']
+ t['country_code_upper'] = c_extra['country'].upper()
+ if c_extra.get('publisher_type'):
+ t['publisher_type'] = c_extra['publisher_type']
+ return t
+
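
A small self-contained illustration of the current-year fallback above; the year-span check is a stand-in for check_kbart(), whose exact signature is assumed, and the coverage data is made up:

    import datetime

    def covered(year, spans):
        # stand-in for check_kbart(): True if year falls inside any [start, end] span
        return any(start <= year <= end for (start, end) in spans)

    this_year = datetime.date.today().year
    spans = [(1995, this_year - 1)]       # hypothetical KBART coverage, lagging one year
    in_kbart = covered(this_year, spans)  # False for a current-year publication
    if not in_kbart:
        # accept last year's coverage instead
        in_kbart = covered(this_year - 1, spans)
    assert in_kbart
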
+def _rte_content_helper(release: ReleaseEntity) -> dict:
+ """
+ File/FileSet/WebCapture sub-section of release_to_elasticsearch()
+
+ The current priority order for "best_pdf_url" is:
+ - internet archive urls (archive.org or web.archive.org)
+ - other webarchive or repository URLs
+ - any other URL
+ """
+ t = dict(
+ file_count = len(release.files or []),
+ fileset_count = len(release.filesets or []),
+ webcapture_count = len(release.webcaptures or []),
+ )
+
+ any_pdf_url = None
+ good_pdf_url = None
+ best_pdf_url = None
+ ia_pdf_url = None
+
+ for f in release.files or []:
+ if f.extra and f.extra.get('shadows'):
+ t['in_shadows'] = True
+ is_pdf = 'pdf' in (f.mimetype or '')
+ for release_url in (f.urls or []):
+ # first generic flags
+ t.update(_rte_url_helper(release_url))
+
+ # then PDF specific stuff (for generating "best URL" fields)
+ if not f.mimetype and 'pdf' in release_url.url.lower():
+ is_pdf = True
+ if is_pdf:
+ any_pdf_url = release_url.url
+ if release_url.rel in ('webarchive', 'repository', 'repo'):
+ good_pdf_url = release_url.url
+ if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+ best_pdf_url = release_url.url
+ ia_pdf_url = release_url.url
+
+    # this is where we bake in PDF URL priority; IA-specific
+ t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+ t['ia_pdf_url'] = ia_pdf_url
+
+ for fs in release.filesets or []:
+ for url_obj in (fs.urls or []):
+ t.update(_rte_url_helper(url_obj))
+
+ for wc in release.webcaptures or []:
+ for url_obj in (wc.archive_urls or []):
+ t.update(_rte_url_helper(url_obj))
+
+ return t
+
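
A hypothetical file entity illustrating the URL priority described in the docstring above; the fatcat_openapi_client model constructors and the private helper import are assumed, and the URLs are made up:

    from fatcat_openapi_client import FileEntity, FileUrl, ReleaseEntity, ReleaseExtIds
    from fatcat_tools.transforms.elasticsearch import _rte_content_helper

    f = FileEntity(
        mimetype="application/pdf",
        urls=[
            FileUrl(url="https://publisher.example.com/paper.pdf", rel="publisher"),
            FileUrl(url="https://repo.example.org/paper.pdf", rel="repository"),
            FileUrl(url="https://web.archive.org/web/2020/https://publisher.example.com/paper.pdf", rel="webarchive"),
        ],
    )
    release = ReleaseEntity(ext_ids=ReleaseExtIds(), files=[f])
    t = _rte_content_helper(release)
    assert t['best_pdf_url'].startswith("https://web.archive.org/")   # IA copy wins
    assert t['ia_pdf_url'] == t['best_pdf_url']
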
+def _rte_url_helper(url_obj) -> dict:
+ """
+ Takes a location URL ('url' and 'rel' keys) and returns generic preservation status.
+
+ Designed to work with file, webcapture, or fileset URLs.
+
+    Returns a dict; should *not* include non-True values for any keys, because
+    these will be merged via update() into the overall object.
+ """
+ t = dict()
+ if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'):
+ t['is_preserved'] = True
+ if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url:
+ t['in_ia'] = True
+ if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'):
+ t['in_web'] = True
+ if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ # not sure what rel will be for this stuff
+ t['in_dweb'] = True
+ if '//www.jstor.org/' in url_obj.url:
+ t['in_jstor'] = True
+ return t
+
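
The reason _rte_url_helper() must only emit True flags: its result is merged with dict.update() once per URL, so a later URL reporting False or None for a key would clobber a True set by an earlier URL. A minimal illustration:

    t = {}
    t.update({'in_web': True, 'in_ia': True})   # first URL: an archive.org copy
    t.update({'in_web': True})                  # second URL: publisher copy, not in IA
    assert t['in_ia'] is True                   # survives only because the key was omitted
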
def container_to_elasticsearch(entity, force_bool=True):
"""
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 2f4e2271..59831017 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -15,15 +15,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
if release.state != 'active':
return None
+ # TODO: infer ingest type based on release_type or container metadata?
+ if not ingest_type:
+ ingest_type = 'pdf'
+
# generate a URL where we expect to find fulltext
url = None
link_source = None
link_source_id = None
- if release.ext_ids.arxiv:
+ if release.ext_ids.arxiv and ingest_type == "pdf":
url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
link_source = "arxiv"
link_source_id = release.ext_ids.arxiv
- elif release.ext_ids.pmcid:
+ elif release.ext_ids.pmcid and ingest_type == "pdf":
# TODO: how to tell if an author manuscript in PMC vs. published?
#url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
@@ -40,10 +44,6 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
ext_ids = release.ext_ids.to_dict()
ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
- # TODO: infer ingest type based on release_type or container metadata?
- if not ingest_type:
- ingest_type = 'pdf'
-
ingest_request = {
'ingest_type': ingest_type,
'ingest_request_source': ingest_request_source,
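
With the pdf default applied up front, the arxiv/pmcid short-circuits now only fire for pdf-type requests. A hedged sketch of the resulting request for an arxiv release; the import path and output keys are assumed from the code above and from tests/files/example_ingest.json, and the ident and arxiv ID are made up:

    from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds
    from fatcat_tools.transforms import release_ingest_request

    release = ReleaseEntity(
        ident="aaaaaaaaaaaaarceaaaaaaaaai",     # made-up ident
        state="active",
        ext_ids=ReleaseExtIds(arxiv="2001.00001v1"),
    )
    req = release_ingest_request(release, ingest_request_source='fatcat-changelog')
    assert req['ingest_type'] == 'pdf'          # defaulted before URL selection
    assert req['base_url'] == 'https://arxiv.org/pdf/2001.00001v1.pdf'
    assert (req['link_source'], req['link_source_id']) == ('arxiv', '2001.00001v1')
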
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 2111a20d..94791770 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -326,6 +326,8 @@ class EntityUpdatesWorker(FatcatWorker):
release_ids = []
new_release_ids = []
file_ids = []
+ fileset_ids = []
+ webcapture_ids = []
container_ids = []
work_ids = []
release_edits = cle['editgroup']['edits']['releases']
@@ -337,6 +339,12 @@ class EntityUpdatesWorker(FatcatWorker):
file_edits = cle['editgroup']['edits']['files']
for e in file_edits:
file_ids.append(e['ident'])
+ fileset_edits = cle['editgroup']['edits']['filesets']
+ for e in fileset_edits:
+ fileset_ids.append(e['ident'])
+ webcapture_edits = cle['editgroup']['edits']['webcaptures']
+ for e in webcapture_edits:
+ webcapture_ids.append(e['ident'])
container_edits = cle['editgroup']['edits']['containers']
for e in container_edits:
container_ids.append(e['ident'])
@@ -348,8 +356,8 @@ class EntityUpdatesWorker(FatcatWorker):
for ident in set(file_ids):
file_entity = self.api.get_file(ident, expand=None)
# update release when a file changes
- # TODO: fetch old revision as well, and only update
- # releases for which list changed
+ # TODO: also fetch old version of file and update any *removed*
+ # release idents (and same for filesets, webcapture updates)
release_ids.extend(file_entity.release_ids or [])
file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
producer.produce(
@@ -358,6 +366,19 @@ class EntityUpdatesWorker(FatcatWorker):
key=ident.encode('utf-8'),
on_delivery=fail_fast,
)
+
+ # TODO: topic for fileset updates
+ for ident in set(fileset_ids):
+ fileset_entity = self.api.get_fileset(ident, expand=None)
+ # update release when a fileset changes
+        release_ids.extend(fileset_entity.release_ids or [])
+
+ # TODO: topic for webcapture updates
+ for ident in set(webcapture_ids):
+ webcapture_entity = self.api.get_webcapture(ident, expand=None)
+ # update release when a webcapture changes
+ release_ids.extend(webcapture_entity.release_ids or [])
+
for ident in set(container_ids):
container = self.api.get_container(ident)
container_dict = self.api.api_client.sanitize_for_serialization(container)
@@ -367,6 +388,7 @@ class EntityUpdatesWorker(FatcatWorker):
key=ident.encode('utf-8'),
on_delivery=fail_fast,
)
+
for ident in set(release_ids):
release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
if release.work_id:
@@ -378,7 +400,7 @@ class EntityUpdatesWorker(FatcatWorker):
key=ident.encode('utf-8'),
on_delivery=fail_fast,
)
- # filter to "new" active releases with no matched files
+ # for ingest requests, filter to "new" active releases with no matched files
if release.ident in new_release_ids:
ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
if ir and not release.files and self.want_live_ingest(release, ir):
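
Note that release_ids is de-duplicated with set() before the release update loop, so a release touched by several kinds of edits in a single editgroup (say, a file and a webcapture) is still re-indexed only once. A toy illustration with made-up idents:

    release_ids = []
    release_ids.extend(["releaseaaaaaaaaaaaaaaaaaai"])   # from a file edit
    release_ids.extend(["releaseaaaaaaaaaaaaaaaaaai"])   # from a webcapture edit, same release
    assert len(set(release_ids)) == 1                    # set(release_ids) drives one update
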
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 7836ef77..cc9cf5fe 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -308,7 +308,7 @@ accessible version.
</div>
{% endif %}
-{% if release.number or release.ext_ids.doi or release.ext_ids.pmid or release.ext_ids.pmcid or release.ext_ids.wikidata_qid or release.ext_ids.isbn13 or release.ext_ids.arxiv or release.ext_ids.jstor or release.ext_ids.ark %}
+{% if release.number or release.ext_ids.doi or release.ext_ids.pmid or release.ext_ids.pmcid or release.ext_ids.wikidata_qid or release.ext_ids.isbn13 or release.ext_ids.arxiv or release.ext_ids.jstor or release.ext_ids.ark or release.ext_ids.doaj or release.ext_ids.dblp %}
<div class="ui segment attached" style="word-wrap: break-word;">
{% if release.number %}
<b>Number</b> &nbsp;<code>{{ release.number }}</code><br>
@@ -337,6 +337,12 @@ accessible version.
{% if release.ext_ids.ark != None %}
<b>ARK</b> &nbsp;<a href="https://n2t.net/{{ release.ext_ids.ark }}"><code>{{ release.ext_ids.ark }}</code></a><br>
{% endif %}
+{% if release.ext_ids.doaj != None %}
+ <b>DOAJ</b> &nbsp;<a href="https://doaj.org/article/{{ release.ext_ids.doaj }}" title="{{ release.ext_ids.doaj }}"><code>{{ release.ext_ids.doaj[:20] }}...</code></a><br>
+{% endif %}
+{% if release.ext_ids.dblp != None %}
+ <b>dblp</b> &nbsp;<a href="https://dblp.org/rec/{{ release.ext_ids.dblp }}.html"><code>{{ release.ext_ids.dblp }}</code></a><br>
+{% endif %}
</div>
{% endif %}
diff --git a/python/tests/files/example_doaj_articles.json b/python/tests/files/example_doaj_articles.json
new file mode 100644
index 00000000..018a4800
--- /dev/null
+++ b/python/tests/files/example_doaj_articles.json
@@ -0,0 +1,5 @@
+{"last_updated":"2020-02-04T14:11:44Z","bibjson":{"identifier":[{"id":"0264-1275","type":"pissn"},{"id":"10.1016/j.matdes.2016.06.110","type":"DOI"}],"journal":{"volume":"108","number":"","country":"GB","license":[{"open_access":true,"title":"CC BY-NC-ND","type":"CC BY-NC-ND","url":"https://www.elsevier.com/journals/materials-and-design/0264-1275/open-access-journal"}],"issns":["0264-1275","1873-4197"],"publisher":"Elsevier","language":["EN"],"title":"Materials & Design"},"month":"10","end_page":"617","year":"2016","start_page":"608","subject":[{"code":"TA401-492","scheme":"LCC","term":"Materials of engineering and construction. Mechanics of materials"}],"author":[{"affiliation":"State Key Laboratory for Mechanical Behavior of Materials, School of Materials Science and Engineering, Xi'an Jiaotong University, Xi'an 710049, China","name":"Xinfeng Li"},{"affiliation":"Department of Geosciences, Center for Materials by Design, State University of New York, Stony Brook, NY 11794-2100, USA","name":"Jin Zhang"},{"affiliation":"School of Chemical Engineering & Technology, China University of Mining and Technology, Xuzhou 221116, China","name":"Yanfei Wang"},{"affiliation":"State Key Laboratory for Mechanical Behavior of Materials, School of Materials Science and Engineering, Xi'an Jiaotong University, Xi'an 710049, China","name":"Sicong Shen"},{"affiliation":"State Key Laboratory for Mechanical Behavior of Materials, School of Materials Science and Engineering, Xi'an Jiaotong University, Xi'an 710049, China; Corresponding author.","name":"Xiaolong Song"}],"link":[{"type":"fulltext","url":"http://www.sciencedirect.com/science/article/pii/S0264127516308723"}],"abstract":"The tensile properties and fracture behavior of PH 13-8 Mo steel after subjected to pre-charged hydrogen were investigated by slow strain rate tensile tests. The results suggest that hydrogen slightly increases yield strength, while decreases tensile strength. The susceptibility to hydrogen embrittlement of specimens aged at 650 °C firstly reduces and then increases as the aging time increases, reaching the lowest value at aging time 4 h. This is dominantly attributed to the highest content of austenite. Moreover, hydrogen-induced crack nucleation sites initiate from lath, packet and prior austenite grain boundaries. Crack propagation passes through lath boundaries and walks along packet, prior austenite grain boundaries. Scanning electron microscopy result indicates that hydrogen-charged specimens show quasi-cleavage fracture and intergranular fracture in annular brittle zone while dimple fracture is observed in hydrogen-free specimens. Keywords: Hydrogen embrittlement, PH 13-8 Mo steel, Aging time, Fracture behavior","title":"Effect of hydrogen on tensile properties and fracture behavior of PH 13-8 Mo steel"},"created_date":"2019-06-05T05:25:15Z","id":"e58f08a11ecb495ead55a44ad4f89808"}
+{"last_updated":"2020-02-04T08:06:42Z","bibjson":{"identifier":[{"id":"2072-6694","type":"eissn"},{"id":"10.3390/cancers9080107","type":"doi"}],"journal":{"volume":"9","number":"8","country":"CH","license":[{"open_access":true,"title":"CC BY","type":"CC BY","url":"http://www.mdpi.com/journal/cancers/about"}],"issns":["2072-6694"],"publisher":"MDPI AG","language":["EN"],"title":"Cancers"},"month":"8","keywords":["ALK rearrangement, lung cancer, biology, immunohistochemistry, FISH, molecular biology."],"year":"2017","start_page":"107","subject":[{"code":"RC254-282","scheme":"LCC","term":"Neoplasms. Tumors. Oncology. Including cancer and carcinogens"}],"author":[{"affiliation":"Laboratory of Clinical and Experimental Pathology, Pasteur Hospital, 30 avenue de la voie romaine, 06001 Nice cedex 01, France","name":"Paul Hofman"}],"link":[{"content_type":"pdf","type":"fulltext","url":"https://www.mdpi.com/2072-6694/9/8/107"}],"abstract":"Patients with advanced-stage non-small cell lung carcinoma (NSCLC) harboring an ALK rearrangement, detected from a tissue sample, can benefit from targeted ALK inhibitor treatment. Several increasingly effective ALK inhibitors are now available for treatment of patients. However, despite an initial favorable response to treatment, in most cases relapse or progression occurs due to resistance mechanisms mainly caused by mutations in the tyrosine kinase domain of ALK. The detection of an ALK rearrangement is pivotal and can be done using different methods, which have variable sensitivity and specificity depending, in particular, on the quality and quantity of the patient’s sample. This review will first highlight briefly some information regarding the pathobiology of an ALK rearrangement and the epidemiology of patients harboring this genomic alteration. The different methods used to detect an ALK rearrangement as well as their advantages and disadvantages will then be examined and algorithms proposed for detection in daily routine practice.","title":"ALK in Non-Small Cell Lung Cancer (NSCLC) Pathobiology, Epidemiology, Detection from Tumor Tissue and Algorithm Diagnosis in a Daily Practice"},"admin":{"seal":true},"created_date":"2018-10-26T07:49:34Z","id":"937c7aa790e048d4ae5f53a2ad71f0dc"}
+{"last_updated":"2020-02-04T13:43:13Z","bibjson":{"identifier":[{"id":"1178-2013","type":"pissn"}],"end_page":"818","keywords":["bioconjugation","biosurfactant","cancer therapy","folic acid receptor","graphene quantum dots","theranostic tool"],"year":"2019","subject":[{"code":"R5-920","scheme":"LCC","term":"Medicine (General)"}],"author":[{"name":"Bansal S"},{"name":"Singh J"},{"name":"Kumari U"},{"name":"Kaur IP"},{"name":"Barnwal RP"},{"name":"Kumar R"},{"name":"Singh S"},{"name":"Singh G"},{"name":"Chatterjee M"}],"link":[{"content_type":"html","type":"fulltext","url":"https://www.dovepress.com/development-of-biosurfactant-based-graphene-quantum-dot-conjugate-as-a-peer-reviewed-article-IJN"}],"abstract":"Smriti Bansal,1 Joga Singh,2 Uma Kumari,3 Indu Pal Kaur,2 Ravi Pratap Barnwal,4 Ravinder Kumar,3 Suman Singh,5 Gurpal Singh,2 Mary Chatterjee1 1Biotechnology Engineering, University Institute of Engineering &amp; Technology, Panjab University, Chandigarh, India; 2Department of Pharmaceutical Sciences, University Institute of Pharmaceutical Sciences, Panjab University, Chandigarh, India; 3Department of Zoology, Panjab University, Chandigarh, India; 4Department of Biophysics, Panjab University, Chandigarh, India; 5Department of Agronomics, Central Scientific Instruments Organisation, Chandigarh, India Background: Biosurfactants are amphipathic molecules of microbial origin that reduce surface and interfacial tension at gas&ndash;liquid&ndash;solid interfaces. Earlier, the biosurfactant was isolated and characterized in our laboratory from Candida parapsilosis. The property of the biosurfactant is further explored in this study by using quantum dots (QDs) as nanocarrier.Materials and methods: Graphene quantum dots (GQDs) were synthesized by bottom-up approach through pyrolysis of citric acid. GQDs were conjugated with both biosurfactant and folic acid (FA) using carbodiimide chemistry. The prepared GQD bioconjugate was studied for diagnostic and therapeutic effects against cancer cells.Results and discussion: Photoluminescence quantum yield (QY) of plain GQDs was measured as 12.8%. QY for biosurfactant conjugated GQDs and FA-biosurfactant conjugated GQDs was measured as 10.4% and 9.02%, respectively, and it was sufficient for targeting cancer cells. MTT assay showed that more than 90% of cells remained viable at concentration of 1 mg/mL, hence GQDs seemed to be non-toxic to cells. Biosurfactant conjugated GQDs caused 50% reduction in cellular viability within 24 hours. FA conjugation further increased the specificity of bioconjugated GQDs toward tumor cells, which is clearly evident from the drug internalization studies using confocal laser scanning microscopy. A higher amount of drug uptake was observed when bioconjugated GQDs were decorated with FA.Conclusion: The ability of GQD bioconjugate could be used as a theranostic tool for cancer. It is foreseen that in near future cancer can be detected and/or treated at an early stage by utilizing biosurfactant conjugated GQDs. Therefore, the proposed study would provide a stepping stone to improve the life of cancer patients. 
Keywords: bioconjugation, nanomedicine, nanocarrier, cancer therapy, folic acid receptor, graphene quantum dots","title":"Development of biosurfactant-based graphene quantum dot conjugate as a novel and fluorescent theranostic tool for cancer","journal":{"volume":"Volume 14","country":"GB","license":[{"open_access":true,"title":"CC BY-NC","type":"CC BY-NC","url":"https://www.dovepress.com/author_guidelines.php?content_id=695"}],"issns":["1176-9114","1178-2013"],"publisher":"Dove Medical Press","language":["EN"],"title":"International Journal of Nanomedicine"},"month":"1","start_page":"809"},"created_date":"2019-01-29T18:43:40Z","id":"e0173c80437f4fb88ec4e02e453e13b0"}
+{"last_updated":"2020-02-04T09:46:14Z","bibjson":{"identifier":[{"id":"1424-8220","type":"eissn"},{"id":"10.3390/s18124467","type":"doi"}],"journal":{"volume":"18","number":"12","country":"CH","license":[{"open_access":true,"title":"CC BY","type":"CC BY","url":"http://www.mdpi.com/journal/sensors/about"}],"issns":["1424-8220"],"publisher":"MDPI AG","language":["EN"],"title":"Sensors"},"month":"12","keywords":["multilayer sea ice temperature","low temperature","design","performance analysis"],"year":"2018","start_page":"4467","subject":[{"code":"TP1-1185","scheme":"LCC","term":"Chemical technology"}],"author":[{"affiliation":"College of Electrical and Power Engineering, Taiyuan University of Technology, Taiyuan 030024, China","name":"Guangyu Zuo"},{"affiliation":"College of Electrical and Power Engineering, Taiyuan University of Technology, Taiyuan 030024, China","name":"Yinke Dou"},{"affiliation":"College of Water Resources Science and Engineering, Taiyuan University of Technology, Taiyuan 030024, China","name":"Xiaomin Chang"},{"affiliation":"College of Electrical and Power Engineering, Taiyuan University of Technology, Taiyuan 030024, China","name":"Yan Chen"},{"affiliation":"College of Electrical and Power Engineering, Taiyuan University of Technology, Taiyuan 030024, China","name":"Chunyan Ma"}],"link":[{"content_type":"pdf","type":"fulltext","url":"https://www.mdpi.com/1424-8220/18/12/4467"}],"abstract":"Temperature profiles of sea ice have been recorded more than a few decades. However, few high-precision temperature sensors can complete the observation of temperature profile of sea ice, especially in extreme environments. At present, the most widely used sea ice observation instruments can reach an accuracy of sea ice temperature measurement of 0.1 &#176;C. In this study, a multilayer sea ice temperature sensor is developed with temperature measurement accuracy from &#8722;0.0047 &#176;C to 0.0059 &#176;C. The sensor system composition, structure of the thermistor string, and work mode are analyzed. The performance of the sensor system is evaluated from &#8722;50 &#176;C to 30 &#176;C. The temperature dependence of the constant current source, the amplification circuit, and the analog-to-digital converter (ADC) circuit are comprehensive tested and quantified. A temperature correction algorithm is designed to correct any deviation in the sensor system. A sea-ice thickness discrimination algorithm is proposed in charge of determining the thickness of sea ice automatically. The sensor system was field tested in Wuliangsuhai, Yellow River on 31 January 2018 and the second reservoir of Fen River, Yellow River on 30 January 2018. The integral practicality of this sensor system is identified and examined. The multilayer sea ice temperature sensor will provide good temperature results of sea ice and maintain stable performance in the low ambient temperature.","title":"Design and Performance Analysis of a Multilayer Sea Ice Temperature Sensor Used in Polar Region"},"admin":{"seal":true},"created_date":"2018-12-18T08:13:29Z","id":"152f83d12b9f477696e681684ba696e7"}
+{"last_updated":"2020-06-02T23:02:32Z","bibjson":{"identifier":[{"id":"10.123/abc","type":"doi"},{"id":"2076-3417","type":"eissn"}],"journal":{"volume":"10","number":"3872","country":"CH","license":[{"open_access":true,"title":"CC BY","type":"CC BY","url":"http://www.mdpi.com/about/openaccess"}],"issns":["2076-3417"],"publisher":"MDPI AG","language":["EN"],"title":"Applied Sciences"},"month":"06","keywords":["Smart parking systems","survey","vehicle routing problem","vehicle detection techniques","routing algorithms"],"year":"2020","start_page":"3872","subject":[{"code":"T","scheme":"LCC","term":"Technology"},{"code":"TA1-2040","scheme":"LCC","term":"Engineering (General). Civil engineering (General)"},{"code":"QH301-705.5","scheme":"LCC","term":"Biology (General)"},{"code":"QC1-999","scheme":"LCC","term":"Physics"},{"code":"QD1-999","scheme":"LCC","term":"Chemistry"}],"author":[{"affiliation":"Institute of Computer Science. Faculty of Exact, Physical and Natural Sciences. National University of San Juan, 5400 San Juan, Argentina","name":"Mathias Gabriel Diaz Ogás"},{"affiliation":"Institute of Informatics and Applications. University of Girona, 17003 Girona, Spain","name":"Ramon Fabregat"},{"affiliation":"Institute of Computer Science. Faculty of Exact, Physical and Natural Sciences. National University of San Juan, 5400 San Juan, Argentina","name":"Silvana Aciar"}],"link":[{"content_type":"text/html","type":"fulltext","url":"https://www.mdpi.com/2076-3417/10/11/3872"}],"abstract":"The large number of vehicles constantly seeking access to congested areas in cities means that finding a public parking place is often difficult and causes problems for drivers and citizens alike. In this context, strategies that guide vehicles from one point to another, looking for the most optimal path, are needed. Most contributions in the literature are routing strategies that take into account different criteria to select the optimal route required to find a parking space. This paper aims to identify the types of smart parking systems (SPS) that are available today, as well as investigate the kinds of vehicle detection techniques (VDT) they have and the algorithms or other methods they employ, in order to analyze where the development of these systems is at today. To do this, a survey of 274 publications from January 2012 to December 2019 was conducted. The survey considered four principal features: SPS types reported in the literature, the kinds of VDT used in these SPS, the algorithms or methods they implement, and the stage of development at which they are. Based on a search and extraction of results methodology, this work was able to effectively obtain the current state of the research area. In addition, the exhaustive study of the studies analyzed allowed for a discussion to be established concerning the main difficulties, as well as the gaps and open problems detected for the SPS. The results shown in this study may provide a base for future research on the subject.","title":"Survey of Smart Parking Systems"},"admin":{"seal":true},"id":"9cf511bab39445ba9745feb43d7493dd","created_date":"2020-06-03T00:02:28Z"}
diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json
index cea67fa7..a9791587 100644
--- a/python/tests/files/example_ingest.json
+++ b/python/tests/files/example_ingest.json
@@ -1,2 +1,2 @@
-{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"}
+{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_type": "pdf", "ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"}
{"request":{"ingest_type":"pdf","ingest_request_source":"fatcat-changelog","base_url":"https://doi.org/10.3917/popav.748.0017","release_stage":"published","fatcat":{"release_ident":"weeqjkvsx5abze2bhithyrx6wu","work_ident":"ujatsk25yrdw5gofubw7nogzgq"},"ext_ids":{"doi":"10.3917/popav.748.0017"},"link_source":"doi","link_source_id":"10.3917/popav.748.0017"},"hit":false,"hops":["https://doi.org/10.3917/popav.748.0017"],"status":"wayback-error","error_message":"replay fetch didn't return X-Archive-Src in headers"}
diff --git a/python/tests/files/example_ingest_html.json b/python/tests/files/example_ingest_html.json
new file mode 100644
index 00000000..6c646814
--- /dev/null
+++ b/python/tests/files/example_ingest_html.json
@@ -0,0 +1 @@
+{"cdx": {"datetime": "20200708025309", "mimetype": "text/html", "sha1b32": "THJFFZJR2VYN2FAR7X7LHFGRU2X5IC2U", "sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "status_code": 200, "surt": "py,una,iics,scielo)/scielo.php?lng=en&nrm=iso&pid=s1683-98032015000200002&script=sci_arttext&tlng=es", "url": "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es", "warc_csize": 13123, "warc_offset": 77579308, "warc_path": "SCIELO-CRAWL-2020-07-20200707211940442-00279-00347-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200708024511243-00332-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "515a61845a2f898438e3986e4506da8f", "mimetype": "text/html", "sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "sha256hex": "c4559d548476a325891461b71c796beee717e820d6a00cb8411176ce83a0f23f", "size_bytes": 47442}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es"], "html_biblio": {"container_issn": "1683-9803", "container_name": "Pediatr\u00eda (Asunci\u00f3n)", "contrib_names": ["Ruiz Valiente, Syntia Carolina", "Ruiz Ca\u00f1ete, Manuel", "Cohene Velazquez, Bartola"], "doi": "10.18004/ped.2015.agosto.102-107", "first_page": "102", "html_fulltext_url": "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es", "issue": "2", "last_page": "107", "pdf_fulltext_url": "http://scielo.iics.una.py/pdf/ped/v42n2/v42n2a02.pdf", "publisher": "Sociedad Paraguaya de Pediatr\u00eda", "release_date": "2015-08-06", "title": "Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011", "volume": "42", "xml_fulltext_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "html_body": {"agent": "trafilatura/0.5.1", "status": "success", "word_count": 3500}, "html_resources": [{"mimetype": "image/gif", "resource_type": "image", "sha1hex": "4991aa771874daf8cba79be38d18d534f946b5d6", "sha256hex": "5e76fad755b873a439dd5e775684696c547008d45cc901606132e9a1ed970757", "size": 220, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/alpha.gif", "timestamp": "2020-10-31T14:07:30", "url": "http://scielo.iics.una.py/img/en/alpha.gif"}, {"mimetype": "text/plain", "resource_type": "script", "sha1hex": "fd28e342fa1b40b84cc17dc66d22df3bf260170b", "sha256hex": "9cf2e81dd65d5a64200970bbd1cd9497b46b2af232e2fbfb79fef95b070f23d1", "size": 3653, "status_code": 200, "surt": "py,una,iics,scielo)/applications/scielo-org/js/toolbox.js", "timestamp": "2020-10-31T20:14:35", "url": "http://scielo.iics.una.py/applications/scielo-org/js/toolbox.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "c5ea6229ce6a97f2dc2b2e2c8ffac26400dfcd58", "sha256hex": "7fb3d59ea14ab060c2b6cbdd5e63d57e158d6cc9e613ceb05ab1e6ec60d64995", "size": 382, "status_code": 200, "surt": "py,una,iics,scielo)/img/common/iconpermalink.gif", "timestamp": "2020-10-31T20:14:52", "url": "http://scielo.iics.una.py/img/common/iconPermalink.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "fbd3488e6b8cd241605fa2db14ba15e0f037d3a7", "sha256hex": "5492829967d521386bec4323f0d7ef951e9a0b16caa1bcd8e75576dc41bd3b55", "size": 26759, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f1.jpg", "timestamp": 
"2020-07-08T02:53:11", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f1.jpg"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "9f1833948223109dfaca2c37fbdbacb81002a346", "sha256hex": "f5b08a2022fce73ae04c3b9fe368645a084132a942bb29950bba705ed89e6d91", "size": 35440, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02t1.jpg", "timestamp": "2020-07-08T02:53:18", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02t1.jpg"}, {"mimetype": "image/png", "resource_type": "image", "sha1hex": "0d2d329000cba763e5eec45bd8ee2743393ebd62", "sha256hex": "d964eed5974264b8f107a905b74796cb3d5e60f78da1c500bb547a419538915e", "size": 3091, "status_code": 200, "surt": "py,una,iics,scielo)/img/common/icon-close.png", "timestamp": "2020-10-24T10:17:13", "url": "http://scielo.iics.una.py/img/common/icon-close.png"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "5812dc64389992d7d59d10e57449407778bbd0c0", "sha256hex": "605ce931ded871d924f31765c6bbf778eb8b5194b3396f49638a88331f53dc21", "size": 652, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconxmldocument.gif", "timestamp": "2020-10-24T14:58:51", "url": "http://scielo.iics.una.py/img/en/iconXMLDocument.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "f20584095b9c7d06250140bf7f51f7bd91e2ba08", "sha256hex": "f8292c0c25d5eec546fe16e8a53101b4933adb2e75e58d7335158dc94b2bae91", "size": 239, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/artsrc.gif", "timestamp": "2020-10-29T17:34:52", "url": "http://scielo.iics.una.py/img/en/artsrc.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "09fe461f38958a267695edf5675f668323f754ec", "sha256hex": "d0792cfc52df6414126a541e8cd32ba151d75f87225c63d38a9ddad389b913b3", "size": 229, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/subject.gif", "timestamp": "2020-10-28T12:59:36", "url": "http://scielo.iics.una.py/img/en/subject.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "7b2f78593847928d8f0f8a2068b0cb366501c3e5", "sha256hex": "97dfc989c7af7a0139950696e533fe71c373539091200edba96f151efb045f8d", "size": 181, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/grp1c.gif", "timestamp": "2020-10-24T10:17:25", "url": "http://scielo.iics.una.py/img/en/grp1c.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "a1e6d8818d56678a52a18859b0cf919b8663a5aa", "sha256hex": "2d34923f1bb8e417a4c244ba5be13b7fe52e0dc6dba9dbcdf512a9fb3cb84d91", "size": 27383, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f2.jpg", "timestamp": "2020-07-08T02:53:13", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f2.jpg"}, {"mimetype": "text/plain", "resource_type": "stylesheet", "sha1hex": "3754bfd4a8608ec125c79ccc7b62ead02c323bbc", "sha256hex": "4dc9b9edd3fc1e58d7a1c39c64551ac07530bedf0721323fc2c820a90a7b4a64", "size": 87, "status_code": 200, "surt": "py,una,iics,scielo)/css/screen.css", "timestamp": "2020-10-24T11:51:22", "url": "http://scielo.iics.una.py/css/screen.css"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "818ff217eae41fe796f21e9b56336011d8806de0", "sha256hex": "a3853400c16b0628dd226487d1ad7710f44a2e6ea8de85f2b2a6a34b7334d5b6", "size": 210, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/search.gif", "timestamp": "2020-10-28T12:59:22", "url": "http://scielo.iics.una.py/img/en/search.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "a6300c0530bdc13b1bf75a7f380cef6c1be48cc7", 
"sha256hex": "aa7fa5a5bedea888ddbb89f20838207eb303323c98c414452f632f96acaccbfe", "size": 660, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconemail.gif", "timestamp": "2020-11-01T12:16:11", "url": "http://scielo.iics.una.py/img/en/iconEmail.gif"}, {"mimetype": "image/png", "resource_type": "image", "sha1hex": "100a6b57582fbf383f96c289c92fbbc9aaa63f06", "sha256hex": "f43d4d35e7ac1e815dc0c8897806e30d928ee62e1aa6ac20f49c649f8b694004", "size": 430, "status_code": 200, "surt": "net,licensebuttons)/l/by/4.0/80x15.png", "timestamp": "2020-07-08T21:51:45", "url": "https://licensebuttons.net/l/by/4.0/80x15.png"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "dbfce64d671bbb03591a297983c81ede279b051d", "sha256hex": "2ef85ef9dd7926099287dd33ab43fc6819b393446e85ddb754897ad457c56282", "size": 244, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/prev.gif", "timestamp": "2020-10-29T01:22:04", "url": "http://scielo.iics.una.py/img/en/prev.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "bf5bed62fc6cc82a8a7e862fceac9ce8ffb12cb8", "sha256hex": "1a90de599f61e3191fec24d504798c372b72ccbc511c3c44d48070a0dddefe25", "size": 262, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconrelatedoff.gif", "timestamp": "2020-10-28T12:59:51", "url": "http://scielo.iics.una.py/img/en/iconRelatedOff.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "4a59b2b1d57a210252311d563eea138afcc7a886", "sha256hex": "f96585d38fb34040d9bd81e83538a7beade916bc2d4456e75d5911181281cb6f", "size": 586, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/e-mailt.gif", "timestamp": "2020-10-31T20:53:28", "url": "http://scielo.iics.una.py/img/en/e-mailt.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "d465c5da9dea2a6e10b5d340c5af6af6cc10f3ec", "sha256hex": "ffc1411a8185c8df5ca9c0725fbfab41380706d213ca42c8552f32323b67901d", "size": 33992, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f3.jpg", "timestamp": "2020-07-08T02:53:19", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f3.jpg"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "86dbd3881975bc15fef15536e5cbd54bad53271c", "sha256hex": "75cbc76c44915b46c6c44fdeeeabd1bfab774ecf692d37ed8d1b4674f5ee583d", "size": 628, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconpdfdocument.gif", "timestamp": "2020-10-24T11:51:12", "url": "http://scielo.iics.una.py/img/en/iconPDFDocument.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "080333f92aa899e53bcd30a4e734d9b36d7ac7a4", "sha256hex": "e6834ac24d48ec9d75b178de59964eb9fb66e9cff05b439ad247f5af5d5fc1ff", "size": 374, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconreferences.gif", "timestamp": "2020-10-29T05:40:05", "url": "http://scielo.iics.una.py/img/en/iconReferences.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "fe879762fd80c756df0af9c81e3424d651fa1b6a", "sha256hex": "4333f6c0ccd89f3240b6c8bb9b2c109792da6d0513e618c35033e2474981b55d", "size": 578, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/icontranslation.gif", "timestamp": "2020-10-28T22:59:05", "url": "http://scielo.iics.una.py/img/en/iconTranslation.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "8846563b9722db2f3b832c03ad2ee9b6318c1d0e", "sha256hex": "6843f628c71f39631ec5d501f6b62506ae9f8454c0a3cd957f4dc67985c371bb", "size": 219, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/author.gif", "timestamp": 
"2020-10-24T10:16:50", "url": "http://scielo.iics.una.py/img/en/author.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "db35a7e4171a130d25632d6e1ba9c3806eec1e87", "sha256hex": "bd6496501a92a6ed3c5e8c16ce0af4ac9b4cece3562934010d0878f6ea06ead0", "size": 288, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconcitedoff.gif", "timestamp": "2020-10-29T17:34:57", "url": "http://scielo.iics.una.py/img/en/iconCitedOff.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "0d844cf48e3ed3849d7e2deed30fb2e7318107b0", "sha256hex": "f257802855722fed0b2b6936a9aede3a4869fc347e91860a26cc81c1ba9df3a3", "size": 164, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/toc.gif", "timestamp": "2020-10-28T22:59:18", "url": "http://scielo.iics.una.py/img/en/toc.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "33e2d3699619eb6dac7c91c207c748599def84f0", "sha256hex": "d8af84c5c4c10e724a081409b0f0e50eb08b9c2cd3d3e0ee0b33cc9eaa20086c", "size": 193, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/next.gif", "timestamp": "2020-11-01T12:16:02", "url": "http://scielo.iics.una.py/img/en/next.gif"}, {"mimetype": "text/html", "resource_type": "script", "sha1hex": "731cf720a546953efe311566a2d874fae715bfc6", "sha256hex": "0d3602fb417d811e15e1a7bd6725384e5bda874dab0eb7be7ee59cd26d64dbd1", "size": 8231, "status_code": 200, "surt": "py,una,iics,scielo)/article.js", "timestamp": "2020-10-29T05:40:31", "url": "http://scielo.iics.una.py/article.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "756779e5ff89d107b2eb4843cc47dd4b63efc829", "sha256hex": "b52a6dc8cbbf4212790cf57af7489b9dc21c040ae7372c07bb6aa18473098759", "size": 190, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/home.gif", "timestamp": "2020-07-06T21:30:58", "url": "http://scielo.iics.una.py/img/en/home.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "e541555caec87c313c9c1859c4b5913b31f955c5", "sha256hex": "64eeb2c0e97f96d9144aa83d027fb5b9d57d96c74681611c39af99d49b148c6e", "size": 643, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/fulltxt.gif", "timestamp": "2020-10-29T22:47:53", "url": "http://scielo.iics.una.py/img/en/fulltxt.gif"}, {"mimetype": "text/plain", "resource_type": "script", "sha1hex": "65cbff4e9d95d47a6f31d96ab4ea361c1f538a7b", "sha256hex": "e23a2a4e2d7c2b41ebcdd8ffc0679df7140eb7f52e1eebabf827a88182643c59", "size": 72174, "status_code": 200, "surt": "py,una,iics,scielo)/applications/scielo-org/js/jquery-1.4.2.min.js", "timestamp": "2020-07-06T21:17:01", "url": "http://scielo.iics.una.py/applications/scielo-org/js/jquery-1.4.2.min.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "20368dd206a00eaf8bb117f98291a30eb0cc8e73", "sha256hex": "534434f1716e29928e0376d0e5dc113808c96d9cedab8675adff7dbf22cb9fd1", "size": 1353, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/fbpelogp.gif", "timestamp": "2020-07-06T21:16:38", "url": "http://scielo.iics.una.py/img/en/fbpelogp.gif"}], "request": {"link_source": "doi", "ingest_request_source": "fatcat-changelog", "base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "html"}, "scope": "article-fulltext", "status": "success", "terminal": {"terminal_dt": "20200708025309", "terminal_sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "terminal_status_code": 200, "terminal_url": 
"http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es"}}
diff --git a/python/tests/files/example_ingest_xml.json b/python/tests/files/example_ingest_xml.json
new file mode 100644
index 00000000..2f525998
--- /dev/null
+++ b/python/tests/files/example_ingest_xml.json
@@ -0,0 +1 @@
+{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", "size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"link_source": "doi", "ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}}
diff --git a/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
new file mode 100644
index 00000000..1c559509
--- /dev/null
+++ b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
@@ -0,0 +1 @@
+{"abstracts":[{"sha1":"b2523f13fc2aa730a2e2336f27d448644074e24f","content":"<p>Jakobshavn Isbræ, West Greenland, which holds a 0.6-m sea level volume equivalent, has been speeding up and retreating since the late 1990s. Interpretation of its retreat has been hindered by difficulties in measuring its ice thickness with airborne radar depth sounders. Here, we employ high-resolution, helicopter-borne gravity data from 2012 to reconstruct its bed elevation within 50 km of the ocean margin using a three-dimensional inversion constrained by fjord bathymetry data offshore and a mass conservation algorithm inland. We find the glacier trough to be asymmetric and several 100 m deeper than estimated previously in the lower part. From 1996-2016, the grounding line migrated at 0.6 km/yr from 700 m to 1,100 m depth. Upstream, the bed drops to 1,600 m over 10 km then slowly climbs to 1,200 m depth in 40 km. Jakobshavn Isbræ will continue to retreat along a retrograde slope for decades to come.\n\nAn L., E. Rignot, S.H.P. Elieff, M. Morlighem, R. Millan, J. Mouginot, D.M. Holland, D. Holland, and J. Paden (2017), Bed elevation of Jakobshavn Isbræ, West Greenland, from high-resolution airborne gravity and other data, Geophys. Res. Lett., 44, doi:10.1002/2017GL073245.\n\n</p>","mimetype":"text/html"}],"refs":[],"contribs":[{"raw_name":"Lu An","role":"author","raw_affiliation":"University of California, Irvine"}],"license_slug":"CC-BY","publisher":"UC Irvine","ext_ids":{"doi":"10.7280/d1j37z"},"release_year":2018,"release_type":"dataset","webcaptures":[],"filesets":[{"release_ids":["3mssw2qnlnblbk7oqyv2dafgey"],"urls":[{"url":"https://merritt.cdlib.org/u/ark%3A%2F13030%2Fm5rg0r8q/1","rel":"repo-bundle"},{"url":"https://merritt.cdlib.org/d/ark%3A%2F13030%2Fm5rg0r8q/1/","rel":"repo"},{"url":"dat://77e94744aa5f967e6ed7e3990bfc29f141dbf2c0fff572eb1212b3bd706882f4/files/","rel":"dweb"}],"manifest":[{"path":"JKS_BedElevation_An_etal_2017.nc","size":736484,"md5":"af738fa325833a56bf947622958fd504","sha1":"443f1867b3a56132905e8d611ad03445d8134d3c","sha256":"52438ef0035b391027e989f00208de5c16ab8f9ff619aa7f45e998d6214a452f","extra":{"mimetype":"application/x-netcdf"}}],"state":"active","ident":"ho376wmdanckpp66iwfs7g22ne","revision":"e07ab7b0-bc0e-4da2-9121-542263e84e2d","extra":{"cdl_dash":{"version":1}}}],"files":[],"work_id":"pbf2dmuu5jf4dac2k22gxsjk6y","title":"Jakobshavn Glacier Bed Elevation","state":"active","ident":"3mssw2qnlnblbk7oqyv2dafgey","revision":"23040a75-2aa6-49f2-af3c-a5c12dcceffe","extra":{"ark_id":"ark:/13030/m5rg0r8q","cdl_dash":{"version":1}}} \ No newline at end of file
diff --git a/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
new file mode 100644
index 00000000..3bfe8564
--- /dev/null
+++ b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
@@ -0,0 +1 @@
+{"abstracts":[],"refs":[],"contribs":[{"index":0,"raw_name":"Catherine C. Marshall","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"CNRI Acct","issue":"3/4","volume":"14","ext_ids":{"doi":"10.1045/march2008-marshall-pt1"},"release_year":2008,"release_stage":"published","release_type":"article-journal","container_id":"ugbiirfvufgcjkx33r3cmemcuu","webcaptures":[{"release_ids":["mjtqtuyhwfdr7j2c3l36uor7uy"],"timestamp":"2019-01-06T18:58:12Z","original_url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","archive_urls":[{"url":"https://web.archive.org/web/","rel":"wayback"}],"cdx":[{"surt":"org,dlib)/dlib/march08/images/spacer00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/spacer00.gif","mimetype":"image/gif","status_code":200,"sha1":"0e75513436e6b01963759f6a88282445ff2e5b3a","sha256":"7455bacb03f7ef04d79010638db14d8434cf7a349914c2ee99eb5d4220338675"},{"surt":"org,dlib)/dlib/march08/marshall/marshall-part1-fig1.png","timestamp":"2019-01-06T19:51:01Z","url":"http://www.dlib.org/dlib/march08/marshall/marshall-part1-fig1.png","mimetype":"image/png","status_code":200,"sha1":"89cee41b938a1d2cdc51688b4be1c72366ae8102","sha256":"d63abfb99c9c48e1e6e3e37bbc5f01c0d37429f0ac0a404ae6aadc1a7d187b60"},{"surt":"org,dlib)/dlib/march08/images/redline00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/redline00.gif","mimetype":"image/gif","status_code":200,"sha1":"3a902e1d6075e37962ab37afc1567819bc3a164e","sha256":"3279d6916807f9e244beb23c91d58cd238509f77a26c06b14314f276b77b9c06"},{"surt":"org,dlib)/dlib/march08/images/commentary00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/commentary00.gif","mimetype":"image/gif","status_code":200,"sha1":"cdbf8804daa2627ef915db725b29cce9eaa9cd68","sha256":"8d8956e992a7f3004ccbbaaebe585ee4c2b1256ad418507d7c33f94b290d0b04"},{"surt":"org,dlib)/dlib/march08/style/main.css","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/style/main.css","mimetype":"text/css","status_code":200,"sha1":"425f00efb41156f03d5c139c1b24acfcbdd611cb","sha256":"ff811660270fc847b5efc3ff9d62967244c924f91a5e4796ac2e6fc8058440ff"},{"surt":"org,dlib)/dlib/march08/marshall/03marshall-pt1.html","timestamp":"2018-12-06T13:16:33Z","url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","mimetype":"text/html","status_code":200,"sha1":"8443a044aa1f4571dd1e5561d59150e34eff0dd2","sha256":"0e9c76cdf20db60b93f0d129e5336e5344aae8bd03c5dbd75a5eea8f5d1820da"}],"revision":"6019e2a1-3503-4e91-97ec-5fba3abc70af","ident":"z7uaeatyvfgwdpuxtrdu4okqii","state":"active"}],"filesets":[],"files":[],"container":{"wikidata_qid":"Q5203268","issnl":"1082-9873","publisher":"Corporation for National Research Initiatives","name":"D-Lib Magazine","extra":{"abbrev":"Dlib Mag","country":"us","issne":"1082-9873","road":{"as_of":"2018-01-24"},"szczepanski":{"as_of":"2018"},"urls":["http://www.dlib.org/"]},"revision":"3957936f-d418-4006-b830-71341068121c","ident":"ugbiirfvufgcjkx33r3cmemcuu","state":"active"},"work_id":"kqi27ogvjvcrtnritxwumkebya","title":"Rethinking Personal Digital Archiving, Part 1","state":"active","ident":"mjtqtuyhwfdr7j2c3l36uor7uy","revision":"74270e11-c961-47f7-a682-1f6ad5927205","extra":{"crossref":{"type":"journal-article"},"subtitle":["Four Challenges from the Field"]}}
diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py
new file mode 100644
index 00000000..d69aebd7
--- /dev/null
+++ b/python/tests/import_doaj.py
@@ -0,0 +1,142 @@
+
+import json
+import datetime
+
+import pytest
+import fatcat_openapi_client
+
+from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher
+from fatcat_tools.transforms import entity_to_dict
+from fixtures import *
+
+
+@pytest.fixture(scope="function")
+def doaj_importer(api):
+ with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
+ yield DoajArticleImporter(api, issn_file, bezerk_mode=True)
+
+def test_doaj_importer(doaj_importer):
+ last_index = doaj_importer.api.get_changelog(limit=1)[0].index
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ doaj_importer.bezerk_mode = True
+ counts = JsonLinePusher(doaj_importer, f).run()
+ assert counts["insert"] == 5
+ assert counts["exists"] == 0
+ assert counts["skip"] == 0
+ success_changelog = doaj_importer.api.get_changelog(limit=1)[0]
+ assert last_index + 1 == success_changelog.index
+
+ # fetch most recent editgroup
+ change = doaj_importer.api.get_changelog_entry(index=last_index + 1)
+ eg = change.editgroup
+ assert eg.description
+ assert "doaj" in eg.description.lower()
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.DoajArticleImporter" in eg.extra["agent"]
+
+ last_index = doaj_importer.api.get_changelog(limit=1)[0].index
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ doaj_importer.bezerk_mode = False
+ doaj_importer.reset()
+ counts = JsonLinePusher(doaj_importer, f).run()
+ assert counts["insert"] == 0
+ assert counts["exists"] == 5
+ assert counts["skip"] == 0
+ assert last_index == doaj_importer.api.get_changelog(limit=1)[0].index
+
+    # cleanup release entities (so other import tests work)
+ success_editgroup = doaj_importer.api.get_editgroup(success_changelog.editgroup_id)
+ eg = quick_eg(doaj_importer.api)
+ for release_edit in success_editgroup.edits.releases:
+ doaj_importer.api.delete_release(eg.editgroup_id, release_edit.ident)
+ doaj_importer.api.accept_editgroup(eg.editgroup_id)
+
+def test_doaj_importer_existing_doi(doaj_importer):
+ """
+ One of the DOAJ test entities has a dummy DOI (10.123/abc); this test
+    ensures that it isn't clobbered, and then that it gets updated.
+ """
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ doaj_importer.reset()
+ doaj_importer.bezerk_mode = False
+ doaj_importer.do_updates = False
+ counts = JsonLinePusher(doaj_importer, f).run()
+ print(counts)
+ assert counts["insert"] == 4
+ assert counts["exists"] == 1
+ assert counts["skip"] == 0
+ success_changelog = doaj_importer.api.get_changelog(limit=1)[0]
+ success_editgroup = doaj_importer.api.get_editgroup(success_changelog.editgroup_id)
+
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ doaj_importer.reset()
+ doaj_importer.bezerk_mode = False
+ doaj_importer.do_updates = True
+ counts = JsonLinePusher(doaj_importer, f).run()
+ print(counts)
+ assert counts["insert"] == 0
+ assert counts["exists"] == 4
+ assert counts["update"] == 1
+ update_changelog = doaj_importer.api.get_changelog(limit=1)[0]
+ update_editgroup = doaj_importer.api.get_editgroup(update_changelog.editgroup_id)
+
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ doaj_importer.reset()
+ doaj_importer.bezerk_mode = False
+ doaj_importer.do_updates = True
+ counts = JsonLinePusher(doaj_importer, f).run()
+ print(counts)
+ assert counts["insert"] == 0
+ assert counts["exists"] == 5
+ assert counts["update"] == 0
+
+ # cleanup release entities (so other import tests work)
+ eg = quick_eg(doaj_importer.api)
+ for release_edit in success_editgroup.edits.releases:
+ doaj_importer.api.delete_release(eg.editgroup_id, release_edit.ident)
+ for release_edit in update_editgroup.edits.releases:
+ print(release_edit)
+ doaj_importer.api.update_release(
+ eg.editgroup_id,
+ release_edit.ident,
+ ReleaseEntity(
+ revision=release_edit.prev_revision,
+ ext_ids=ReleaseExtIds(),
+ ),
+ )
+ doaj_importer.api.accept_editgroup(eg.editgroup_id)
+
+def test_doaj_dict_parse(doaj_importer):
+ with open("tests/files/example_doaj_articles.json", "r") as f:
+ raw = json.loads(f.readline())
+ r = doaj_importer.parse_record(raw)
+
+ assert r.title == "Effect of hydrogen on tensile properties and fracture behavior of PH 13-8 Mo steel"
+ assert r.publisher == "Elsevier"
+ assert r.release_type == "article-journal"
+ assert r.release_stage == "published"
+ assert r.license_slug == "cc-by-nc-nd"
+ assert r.original_title == None
+ assert r.ext_ids.doi == "10.1016/j.matdes.2016.06.110"
+ assert r.ext_ids.doaj == "e58f08a11ecb495ead55a44ad4f89808"
+ assert r.subtitle == None
+ assert r.release_date == None
+ assert r.release_year == 2016
+ assert r.volume == "108"
+ assert r.number == None
+ assert r.pages == "608-617"
+ assert r.version == None
+ assert r.language == "en"
+ # in a full import the container would be matched by ISSN and container_name would not normally be set here
+ assert r.extra['container_name'] == "Materials & Design"
+ assert len(r.abstracts) == 1
+ assert len(r.abstracts[0].content) == 1033
+ assert len(r.contribs) == 5
+ assert r.contribs[0].raw_name == "Xinfeng Li"
+ assert r.contribs[0].given_name == None
+ assert r.contribs[0].surname == None
+ assert not r.refs
+
+ #print(r.extra)
+ assert r.extra['release_month'] == 10
+ assert r.extra['country'] == 'gb'
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index 4a46232a..92539f1a 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -2,7 +2,7 @@
import json
import pytest
-from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher
+from fatcat_tools.importers import IngestFileResultImporter, IngestWebResultImporter, JsonLinePusher
from fixtures import *
@@ -10,6 +10,10 @@ from fixtures import *
def ingest_importer(api):
yield IngestFileResultImporter(api)
+@pytest.fixture(scope="function")
+def ingest_web_importer(api):
+ yield IngestWebResultImporter(api)
+
# TODO: use API to check that entities actually created...
def test_ingest_importer_basic(ingest_importer):
with open('tests/files/example_ingest.json', 'r') as f:
@@ -41,6 +45,60 @@ def test_ingest_importer(ingest_importer):
assert counts['exists'] == 1
assert counts['skip'] == 1
+def test_ingest_importer_xml(ingest_importer):
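+ # XML ingest results are still handled by IngestFileResultImporter and should create a file entity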
+ last_index = ingest_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/example_ingest_xml.json', 'r') as f:
+ ingest_importer.bezerk_mode = True
+ counts = JsonLinePusher(ingest_importer, f).run()
+ print(counts)
+ assert counts['insert'] == 1
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = ingest_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "crawled from web" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent']
+
+ # re-import should skip
+ with open('tests/files/example_ingest_xml.json', 'r') as f:
+ ingest_importer.reset()
+ ingest_importer.bezerk_mode = False
+ counts = JsonLinePusher(ingest_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 1
+ assert counts['skip'] == 0
+
+def test_ingest_importer_web(ingest_web_importer):
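+ # HTML ingest results go through IngestWebResultImporter instead of the file importer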
+ last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/example_ingest_html.json', 'r') as f:
+ ingest_web_importer.bezerk_mode = True
+ counts = JsonLinePusher(ingest_web_importer, f).run()
+ print(counts)
+ assert counts['insert'] == 1
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = ingest_web_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "crawled from web" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.IngestWebResultImporter" in eg.extra['agent']
+
+ # re-import should skip
+ with open('tests/files/example_ingest_html.json', 'r') as f:
+ ingest_web_importer.reset()
+ ingest_web_importer.bezerk_mode = False
+ counts = JsonLinePusher(ingest_web_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 1
+ assert counts['skip'] == 0
+
def test_ingest_importer_stage(ingest_importer, api):
"""
Tests that ingest importer correctly handles release stage matching
@@ -57,7 +115,7 @@ def test_ingest_importer_stage(ingest_importer, api):
with open('tests/files/example_ingest.json', 'r') as f:
raw = json.loads(f.readline())
for row in test_table:
- print(row)
+ #print(row)
# set dummy record stage
eg = quick_eg(api)
@@ -94,6 +152,12 @@ def test_ingest_dict_parse(ingest_importer):
def test_ingest_dict_parse_old(ingest_importer):
with open('tests/files/example_ingest.old.json', 'r') as f:
raw = json.loads(f.readline())
+
+ # ancient ingest requests had no ingest_type; such records should be skipped
+ f = ingest_importer.parse_record(raw)
+ assert f == None
+ raw['request']['ingest_type'] = 'pdf'
+
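+ # with an ingest_type set, the same record should parse into a file entity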
f = ingest_importer.parse_record(raw)
assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313"
assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff"
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 0d96e139..b5f23e76 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -43,7 +43,7 @@ def test_rich_elasticsearch_convert():
"year_spans": [[1200, 1300]],
},
"jstor": {
- "year_spans": [[1950, 1960], [1980, 2005]],
+ "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
},
},
"sherpa_romeo": {"color": "blue"},
@@ -63,17 +63,23 @@ def test_rich_elasticsearch_convert():
)]
es = release_to_elasticsearch(r)
assert es['release_year'] == r.release_year
- assert es['in_ia'] == True
- assert es['in_jstor'] == False
- assert es['in_ia_sim'] == False
- assert es['in_ia'] == True
- assert es['in_web'] == True
- assert es['in_dweb'] == True
- assert es['is_oa'] == True
- assert es['is_longtail_oa'] == False
+ assert es['file_count'] == 1
+ assert es['fileset_count'] == 0
+ assert es['webcapture_count'] == 0
assert es['ref_count'] == 2
assert es['ref_linked_count'] == 1
+ assert es['preservation'] == "bright"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == True
+ assert es['in_ia'] == True
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == True
+ assert es['in_jstor'] == True
+
def test_elasticsearch_release_from_json():
r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
es = release_to_elasticsearch(r)
@@ -85,8 +91,59 @@ def test_elasticsearch_release_from_json():
assert es['issue'] == "11"
assert es['volume'] == "118"
assert es['number'] == None
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == False
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
+ assert es['in_ia'] == False
assert es['in_ia_sim'] == True
assert es['in_kbart'] == True
+ assert es['in_jstor'] == False
+
+ # this release has a fileset, and no file
+ r = entity_from_json(open('./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json', 'r').read(), ReleaseEntity)
+ es = release_to_elasticsearch(r)
+
+ assert es['title'] == "Jakobshavn Glacier Bed Elevation"
+ assert es['ident'] == "3mssw2qnlnblbk7oqyv2dafgey"
+ assert es['file_count'] == 0
+ assert es['fileset_count'] == 1
+ assert es['webcapture_count'] == 0
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == True
+ assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == False
+ assert es['in_jstor'] == False
+
+ # this release has a webcapture and no file (the fixture JSON was edited to remove the file)
+ r = entity_from_json(open('./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json', 'r').read(), ReleaseEntity)
+ es = release_to_elasticsearch(r)
+
+ assert es['title'] == "Rethinking Personal Digital Archiving, Part 1"
+ assert es['ident'] == "mjtqtuyhwfdr7j2c3l36uor7uy"
+ assert es['file_count'] == 0
+ assert es['fileset_count'] == 0
+ assert es['webcapture_count'] == 1
+
+ assert es['preservation'] == "bright"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == True
+ assert es['in_dweb'] == False
+ assert es['in_ia'] == True
+ assert es['in_ia_sim'] == False
+ assert es['in_kbart'] == False
+ assert es['in_jstor'] == False
def test_elasticsearch_container_transform(journal_metadata_importer):
with open('tests/files/journal_metadata.sample.json', 'r') as f:
@@ -164,9 +221,17 @@ def test_elasticsearch_release_kbart_year():
)
es = release_to_elasticsearch(r)
assert es['release_year'] == this_year
+
+ assert es['preservation'] == "none"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == None
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
assert es['in_kbart'] == False
- assert es['preservation'] == "none"
+ assert es['in_jstor'] == False
r.container = ContainerEntity(
name="dummy journal",
@@ -180,6 +245,14 @@ def test_elasticsearch_release_kbart_year():
)
es = release_to_elasticsearch(r)
assert es['release_year'] == this_year
+
+ assert es['preservation'] == "dark"
+ assert es['is_oa'] == True
+ assert es['is_longtail_oa'] == False
+ assert es['is_preserved'] == True
+ assert es['in_web'] == False
+ assert es['in_dweb'] == False
assert es['in_ia'] == False
+ assert es['in_ia_sim'] == False
assert es['in_kbart'] == True
- assert es['preservation'] == "dark"
+ assert es['in_jstor'] == False