-rw-r--r--extra/stats/2020-02-19-prod-stats.json1
-rw-r--r--extra/stats/2020-02-19-prod-table-sizes.txt46
-rw-r--r--proposals/2020_sql_size_reduction.md16
-rwxr-xr-xpython/fatcat_import.py15
-rwxr-xr-xpython/fatcat_ingest.py188
-rw-r--r--python/fatcat_tools/harvest/doi_registrars.py14
-rw-r--r--python/fatcat_tools/harvest/harvest_common.py10
-rw-r--r--python/fatcat_tools/harvest/oaipmh.py15
-rw-r--r--python/fatcat_tools/importers/__init__.py1
-rw-r--r--python/fatcat_tools/importers/common.py9
-rw-r--r--python/fatcat_tools/importers/datacite.py75
-rw-r--r--python/fatcat_tools/importers/ingest.py25
-rw-r--r--python/fatcat_tools/importers/shadow.py195
-rw-r--r--python/fatcat_tools/transforms/ingest.py8
-rw-r--r--python/fatcat_tools/workers/changelog.py52
-rw-r--r--python/fatcat_web/entity_helpers.py4
-rw-r--r--python/fatcat_web/templates/release_view.html5
-rw-r--r--python/tests/files/datacite/datacite_doc_30.json72
-rw-r--r--python/tests/files/datacite/datacite_result_30.json39
-rw-r--r--python/tests/files/example_shadow.json10
-rw-r--r--python/tests/import_datacite.py3
-rw-r--r--python/tests/import_shadow.py61
22 files changed, 768 insertions, 96 deletions
diff --git a/extra/stats/2020-02-19-prod-stats.json b/extra/stats/2020-02-19-prod-stats.json
new file mode 100644
index 00000000..a2313233
--- /dev/null
+++ b/extra/stats/2020-02-19-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":3509511,"timestamp":"2020-02-20T01:42:50.980212+00:00"}},"container":{"total":148356},"papers":{"in_kbart":60523853,"in_web":19616767,"in_web_not_kbart":8937938,"is_oa":11524180,"total":105665352},"release":{"refs_total":889522285,"total":143709455}}
diff --git a/extra/stats/2020-02-19-prod-table-sizes.txt b/extra/stats/2020-02-19-prod-table-sizes.txt
new file mode 100644
index 00000000..cab2b52e
--- /dev/null
+++ b/extra/stats/2020-02-19-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 476.74G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 53 GB | 43 GB | 96 GB
+ "public"."release_rev" | 58 GB | 33 GB | 91 GB
+ "public"."refs_blob" | 85 GB | 2884 MB | 88 GB
+ "public"."release_edit" | 14 GB | 20 GB | 34 GB
+ "public"."work_edit" | 13 GB | 20 GB | 34 GB
+ "public"."release_ident" | 9504 MB | 15 GB | 24 GB
+ "public"."work_ident" | 9302 MB | 15 GB | 24 GB
+ "public"."abstracts" | 16 GB | 1501 MB | 18 GB
+ "public"."file_rev_url" | 9980 MB | 3550 MB | 13 GB
+ "public"."work_rev" | 6038 MB | 5825 MB | 12 GB
+ "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB
+ "public"."file_rev" | 3472 MB | 5103 MB | 8574 MB
+ "public"."file_edit" | 2934 MB | 3959 MB | 6893 MB
+ "public"."release_rev_abstract" | 2402 MB | 3339 MB | 5742 MB
+ "public"."file_ident" | 1795 MB | 2437 MB | 4231 MB
+ "public"."file_rev_release" | 1651 MB | 2428 MB | 4078 MB
+ "public"."creator_edit" | 702 MB | 942 MB | 1643 MB
+ "public"."creator_rev" | 695 MB | 719 MB | 1413 MB
+ "public"."editgroup" | 761 MB | 404 MB | 1164 MB
+ "public"."creator_ident" | 474 MB | 648 MB | 1121 MB
+ "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB
+ "public"."changelog" | 218 MB | 214 MB | 432 MB
+ "public"."container_rev" | 75 MB | 23 MB | 98 MB
+ "public"."container_edit" | 25 MB | 31 MB | 56 MB
+ "public"."container_ident" | 11 MB | 19 MB | 30 MB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."auth_oidc" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editor" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/proposals/2020_sql_size_reduction.md b/proposals/2020_sql_size_reduction.md
index f421e455..2fa39873 100644
--- a/proposals/2020_sql_size_reduction.md
+++ b/proposals/2020_sql_size_reduction.md
@@ -52,6 +52,8 @@ Other growth is expected to be much smaller, let's say a few GB of disk.
This works out to a bit over 600 GByte total disk size.
+NOTE: math was wrong? 470 + 80 + 100 -> 650 GByte, call it 700 GByte
+
## Idea: finish `ext_id` migration and drop columns+index from `release_rev`
@@ -172,3 +174,17 @@ would drop ~20% of data size and ~20% of index size.
Would it make more sense to use {ident, editgroup} as the primary key and UNIQ,
then have a separate index on `editgroup`? On the assumption that `editgroup`
cardinality is much smaller, thus the index disk usage would be smaller.
+
+## Idea: use binary for hashes
+
+We currently store file hashes (SHA-1, SHA-256, MD5) and abstracts/`refs_blob`
+keys as TEXT in lower-case hex encoding. Using binary instead could be as much
+as a 50% size savings for both column and index storage. The difference becomes
+more apparent when all files have all hashes populated.
+
+Base32-encoded strings would yield smaller (but still non-negligible) savings.
+
+This change has a reasonable migration path, is entirely internal to postgres
+and fatcatd, and would require no change to the API schema. Postgres also allows
+`hex` encoding of the `bytea` data type, which keeps reading/debugging reasonable.
+
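For intuition only, a small Python sketch of the encoding overhead this idea targets (illustrative, not part of the migration itself):

    import base64
    import hashlib

    sha256_hex = hashlib.sha256(b"example pdf bytes").hexdigest()
    sha256_raw = bytes.fromhex(sha256_hex)
    print(len(sha256_hex))  # 64 characters when stored as lower-case hex TEXT
    print(len(sha256_raw))  # 32 bytes if stored as bytea
    # unpadded base32 lands in between, at 52 characters
    print(len(base64.b32encode(sha256_raw).rstrip(b"=")))  # 52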
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ad4de0e2..843685aa 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,11 @@ def run_grobid_metadata(args):
bezerk_mode=args.bezerk_mode)
LinePusher(fmi, args.tsv_file).run()
+def run_shadow_lib(args):
+ fmi = ShadowLibraryImporter(args.api,
+ edit_batch_size=100)
+ JsonLinePusher(fmi, args.json_file).run()
+
def run_wayback_static(args):
api = args.api
@@ -473,6 +478,16 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+ sub_shadow_lib = subparsers.add_parser('shadow-lib',
+ help="create release and file entities based on GROBID PDF metadata extraction")
+ sub_shadow_lib.set_defaults(
+ func=run_shadow_lib,
+ auth_var="FATCAT_AUTH_WORKER_SHADOW",
+ )
+ sub_shadow_lib.add_argument('json_file',
+ help="JSON file to import from (or stdin)",
+ default=sys.stdin, type=argparse.FileType('r'))
+
sub_wayback_static = subparsers.add_parser('wayback-static',
help="crude crawl+ingest tool for single-page HTML docs from wayback")
sub_wayback_static.set_defaults(
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 6ce36974..6fda74c5 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -11,7 +11,7 @@ import argparse
from collections import Counter
import raven
import elasticsearch
-from elasticsearch_dsl import Search
+from elasticsearch_dsl import Search, Q
from fatcat_tools import public_api, simple_kafka_producer, kafka_fail_fast
from fatcat_tools.transforms import release_ingest_request
@@ -21,45 +21,54 @@ from fatcat_tools.transforms import release_ingest_request
sentry_client = raven.Client()
-def run_ingest_container(args):
- """
- This command queries elasticsearch for releases from a given container (eg,
- journal), and prepares ingest requests for them.
-
- By default it filters to releases which don't have any fulltext files
- archived in IA, and dumps the ingest requests as JSON.
- """
+def _init_search(args):
# ensure API connection works
args.api.get_changelog()
+ client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint)
+ search = Search(using=client, index="fatcat_release")
+ return search
+
+
+def _run_search_dump(args, search):
+
+ if args.dry_run:
+ print("=== THIS IS A DRY RUN ===")
+
kafka_producer = None
ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
if args.enqueue_kafka:
print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr)
kafka_producer = simple_kafka_producer(args.kafka_hosts)
- client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint)
-
- s = Search(using=client, index="fatcat_release") \
- .filter("term", in_ia=False) \
- .filter("term", is_oa=True)
-
- # filter/query by container
- if args.container_id:
- s = s.filter("term", container_id=args.container_id)
- elif args.issnl:
- s = s.filter("term", container_issnl=args.issnl)
- elif args.publisher:
- s = s.query("match", publisher=args.publisher)
- elif args.name:
- s = s.query("match", container_name=args.name)
+ if args.limit is not None:
+ search = search[:args.limit]
+
+ if args.before_year:
+ search = search \
+ .filter("exists", field="release_year") \
+ .filter("range", release_date=dict(lt=args.before_year))
+ if args.after_year:
+ search = search \
+ .filter("exists", field="release_year") \
+ .filter("range", release_date=dict(gte=args.after_year))
+
+ if not args.allow_non_oa:
+ search = search.filter("term", is_oa=True)
+
+ if args.release_types:
+ release_types = args.release_types.split(',')
+ search = search \
+ .filter("terms", release_type=release_types)
else:
- print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr)
- sys.exit(-1)
+ search = search \
+ .filter("bool", must_not=[
+ Q("terms", release_type=["stub", "component"])
+ ])
counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0})
- counts['estimate'] = s.count()
+ counts['estimate'] = search.count()
print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr)
# don't try to clean up scroll if we are connected to public server (behind
@@ -67,19 +76,24 @@ def run_ingest_container(args):
if args.elasticsearch_endpoint in (
'https://search.fatcat.wiki',
'https://search.qa.fatcat.wiki'):
- s = s.params(clear_scroll=False)
+ search = search.params(clear_scroll=False)
- results = s.scan()
+ results = search.scan()
for esr in results:
+ if args.limit and counts['ingest_request'] >= args.limit:
+ break
counts['elasticsearch_release'] += 1
release = args.api.get_release(esr.ident)
ingest_request = release_ingest_request(
release,
- ingest_request_source="fatcat-ingest-container",
+ ingest_request_source="fatcat-ingest",
)
if not ingest_request:
continue
- if kafka_producer != None:
+ counts['ingest_request'] += 1
+ if args.dry_run:
+ continue
+ if kafka_producer is not None:
kafka_producer.produce(
ingest_file_request_topic,
json.dumps(ingest_request).encode('utf-8'),
@@ -87,12 +101,73 @@ def run_ingest_container(args):
on_delivery=kafka_fail_fast,
)
counts['kafka'] += 1
- # also printing to stdout when in kafka mode; could skip?
- print(json.dumps(ingest_request))
- counts['ingest_request'] += 1
- if kafka_producer != None:
+ else:
+ print(json.dumps(ingest_request))
+ if kafka_producer is not None:
kafka_producer.flush()
print(counts, file=sys.stderr)
+ if args.dry_run:
+ print("=== THIS WAS A DRY RUN ===")
+
+
+def run_ingest_container(args):
+ """
+ This command queries elasticsearch for releases from a given container (eg,
+ journal), and prepares ingest requests for them.
+
+ By default it filters to releases which don't have any fulltext files
+ archived in IA, and dumps the ingest requests as JSON.
+ """
+
+ search = _init_search(args).filter("term", in_ia=False)
+
+ # filter/query by container
+ if args.container_id:
+ search = search.filter("term", container_id=args.container_id)
+ elif args.issnl:
+ search = search.filter("term", container_issnl=args.issnl)
+ elif args.publisher:
+ search = search.query("match", publisher=args.publisher)
+ elif args.name:
+ search = search.query("match", container_name=args.name)
+ else:
+ print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr)
+ sys.exit(-1)
+
+ return _run_search_dump(args, search)
+
+
+def run_ingest_query(args):
+ """
+ Accepts a free-form Lucene query language string. Intended to work the same
+ way as searches in the fatcat web interface.
+ """
+
+ search = _init_search(args) \
+ .filter("term", in_ia=False) \
+ .query(
+ "query_string",
+ query=args.query,
+ default_operator="AND",
+ analyze_wildcard=True,
+ lenient=True,
+ fields=["title^5", "contrib_names^2", "container_title"],
+ )
+
+ return _run_search_dump(args, search)
+
+
+def run_ingest_extid(args):
+ """
+ Selects release entities where the external identifier (extid) exists
+ """
+
+ search = _init_search(args) \
+ .filter("term", in_ia=False) \
+ .filter("exists", field=args.extid)
+
+ return _run_search_dump(args, search)
+
def main():
parser = argparse.ArgumentParser(
@@ -112,20 +187,51 @@ def main():
parser.add_argument('--env',
default="dev",
help="Kafka topic namespace to use (eg, prod, qa, dev)")
+ parser.add_argument('--limit',
+ default=None,
+ type=int,
+ help="Max number of search hits to return")
+ parser.add_argument('--dry-run',
+ action='store_true',
+ help="runs through creating all ingest requests, but doesn't actually output or enqueue")
+ parser.add_argument('--before-year',
+ type=str,
+ help="filters results to only with release_year before this (not inclusive)")
+ parser.add_argument('--after-year',
+ type=str,
+ help="filters results to only with release_year after this (inclusive)")
+ parser.add_argument('--release-types',
+ type=str,
+ help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.")
+ parser.add_argument('--allow-non-oa',
+ action='store_true',
+ help="By default, we limit to OA releases. This removes that filter")
subparsers = parser.add_subparsers()
- sub_ingest_container = subparsers.add_parser('ingest-container',
+ sub_container = subparsers.add_parser('container',
help="Create ingest requests for releases from a specific container")
- sub_ingest_container.set_defaults(func=run_ingest_container)
- sub_ingest_container.add_argument('--container-id',
+ sub_container.set_defaults(func=run_ingest_container)
+ sub_container.add_argument('--container-id',
help="fatcat container entity ident")
- sub_ingest_container.add_argument('--issnl',
+ sub_container.add_argument('--issnl',
help="ISSN-L of container entity")
- sub_ingest_container.add_argument('--publisher',
+ sub_container.add_argument('--publisher',
help="publisher name")
- sub_ingest_container.add_argument('--name',
+ sub_container.add_argument('--name',
help="container name")
+ sub_query = subparsers.add_parser('query',
+ help="Create ingest requests for releases from a specific query")
+ sub_query.set_defaults(func=run_ingest_query)
+ sub_query.add_argument('query',
+ help="search query (same DSL as web interface search)")
+
+ sub_extid = subparsers.add_parser('extid',
+ help="Create ingest requests for releases that have given extid defined")
+ sub_extid.set_defaults(func=run_ingest_extid)
+ sub_extid.add_argument('extid',
+ help="extid short name (as included in ES release schema)")
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 33f44600..d2d71d3c 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -70,8 +70,8 @@ class HarvestCrossrefWorker:
def fail_fast(err, msg):
if err is not None:
- print("Kafka producer delivery error: {}".format(err))
- print("Bailing out...")
+ print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
@@ -117,7 +117,7 @@ class HarvestCrossrefWorker:
if http_resp.status_code == 503:
# crude backoff; now redundant with session exponential
# backoff, but allows for longer backoff/downtime on remote end
- print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))
+ print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr)
# keep kafka producer connection alive
self.producer.poll(0)
time.sleep(30.0)
@@ -131,7 +131,7 @@ class HarvestCrossrefWorker:
items = self.extract_items(resp)
count += len(items)
print("... got {} ({} of {}), HTTP fetch took {}".format(len(items), count,
- self.extract_total(resp), http_resp.elapsed))
+ self.extract_total(resp), http_resp.elapsed), file=sys.stderr)
#print(json.dumps(resp))
for work in items:
self.producer.produce(
@@ -156,7 +156,7 @@ class HarvestCrossrefWorker:
while True:
current = self.state.next(continuous)
if current:
- print("Fetching DOIs updated on {} (UTC)".format(current))
+ print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)
self.fetch_date(current)
self.state.complete(current,
kafka_topic=self.state_topic,
@@ -164,11 +164,11 @@ class HarvestCrossrefWorker:
continue
if continuous:
- print("Sleeping {} seconds...".format(self.loop_sleep))
+ print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr)
time.sleep(self.loop_sleep)
else:
break
- print("{} DOI ingest caught up".format(self.name))
+ print("{} DOI ingest caught up".format(self.name), file=sys.stderr)
class HarvestDataciteWorker(HarvestCrossrefWorker):
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index 78830a1c..310366bd 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -57,6 +57,10 @@ class HarvestState:
if catchup_days or start_date or end_date:
self.enqueue_period(start_date, end_date, catchup_days)
+ def __str__(self):
+ return '<HarvestState to_process={}, completed={}>'.format(
+ len(self.to_process), len(self.completed))
+
def enqueue_period(self, start_date=None, end_date=None, catchup_days=14):
"""
This function adds a time period to the "TODO" list, unless the dates
@@ -129,7 +133,7 @@ class HarvestState:
def fail_fast(err, msg):
if err:
raise KafkaException(err)
- print("Commiting status to Kafka: {}".format(kafka_topic))
+ print("Commiting status to Kafka: {}".format(kafka_topic), file=sys.stderr)
producer_conf = kafka_config.copy()
producer_conf.update({
'delivery.report.only.error': True,
@@ -154,7 +158,7 @@ class HarvestState:
if not kafka_topic:
return
- print("Fetching state from kafka topic: {}".format(kafka_topic))
+ print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr)
def fail_fast(err, msg):
if err:
raise KafkaException(err)
@@ -191,4 +195,4 @@ class HarvestState:
# verify that we got at least to HWM
assert c >= hwm[1]
- print("... got {} state update messages, done".format(c))
+ print("... got {} state update messages, done".format(c), file=sys.stderr)
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index f908ba83..11b5fa0a 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -49,13 +49,14 @@ class HarvestOaiPmhWorker:
self.name = "unnamed"
self.state = HarvestState(start_date, end_date)
self.state.initialize_from_kafka(self.state_topic, self.kafka_config)
+ print(self.state, file=sys.stderr)
def fetch_date(self, date):
def fail_fast(err, msg):
if err is not None:
- print("Kafka producer delivery error: {}".format(err))
- print("Bailing out...")
+ print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
@@ -79,14 +80,14 @@ class HarvestOaiPmhWorker:
'until': date_str,
})
except sickle.oaiexceptions.NoRecordsMatch:
- print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str))
+ print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr)
return
count = 0
for item in records:
count += 1
if count % 50 == 0:
- print("... up to {}".format(count))
+ print("... up to {}".format(count), file=sys.stderr)
producer.produce(
self.produce_topic,
item.raw.encode('utf-8'),
@@ -99,7 +100,7 @@ class HarvestOaiPmhWorker:
while True:
current = self.state.next(continuous)
if current:
- print("Fetching DOIs updated on {} (UTC)".format(current))
+ print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)
self.fetch_date(current)
self.state.complete(current,
kafka_topic=self.state_topic,
@@ -107,11 +108,11 @@ class HarvestOaiPmhWorker:
continue
if continuous:
- print("Sleeping {} seconds...".format(self.loop_sleep))
+ print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr)
time.sleep(self.loop_sleep)
else:
break
- print("{} OAI-PMH ingest caught up".format(self.name))
+ print("{} OAI-PMH ingest caught up".format(self.name), file=sys.stderr)
class HarvestArxivWorker(HarvestOaiPmhWorker):
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 1ffbd6e7..a84ce90f 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
"www.scielo.cl": "repository",
"www.scielo.org.mx": "repository",
"zenodo.org": "repository",
+ "www.biorxiv.org": "repository",
+ "www.medrxiv.org": "repository",
"citeseerx.ist.psu.edu": "aggregator",
"publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
"www.nature.com": "publisher",
"www.pnas.org": "publisher",
"www.tandfonline.com": "publisher",
+ "www.frontiersin.org": "publisher",
+ "www.degruyter.com": "publisher",
+ "www.mdpi.com": "publisher",
+ "www.ahajournals.org": "publisher",
+ "ehp.niehs.nih.gov": "publisher",
+ "journals.tsu.ru": "publisher",
+ "www.cogentoa.com": "publisher",
"www.researchgate.net": "academicsocial",
"academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..4e382348 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,11 +1,11 @@
"""
Prototype importer for datacite.org data.
-Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51
-Datacite being an aggregator, the data is varied and exposes a couple of
-problems in content and structure. A few fields habe their own parsing
-functions (parse_datacite_...), which can be tested more easily.
+Datacite being an aggregator, the data is heterogeneous and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which makes them easier to test.
"""
import collections
@@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter):
release_date, release_month, release_year = parse_datacite_dates(
attributes.get('dates', []))
+    # Some records do not use the "dates" field (e.g. micropub); fall back to
+    # "attributes.publicationYear", then "attributes.published"
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+ if not any((release_date, release_month, release_year)):
+ print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter):
len(container_name)))
container_name = container_name[0]
+ # Exception: https://www.micropublication.org/, see: !MR24.
+ if container_id is None and container_name is None:
+ if publisher and publisher.lower().startswith('micropublication'):
+ container_name = publisher
+
# Volume and issue.
volume = container.get('volume')
issue = container.get('issue')
@@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter):
if len(text) > MAX_ABSTRACT_LENGTH:
text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
- # Detect language.
+        # Detect language. This is fuzzy and may be removed if it proves too unreliable.
lang = None
try:
lang = langdetect.detect(text)
@@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter):
if name:
name = clean(name)
- if not name:
+ if not any((name, given_name, surname)):
continue
+ if not name:
+ name = "{} {}".format(given_name or '', surname or '').strip()
if name in name_blacklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +941,32 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
+def parse_single_date(value):
+ """
+ Given a single string containing a date in arbitrary format, try to return
+ tuple (date: datetime.date, month: int, year: int).
+ """
+ if not value:
+ return None, None, None
+ if isinstance(value, int):
+ value = str(value)
+ parser = dateparser.DateDataParser()
+ try:
+ # Results in a dict with keys: date_obj, period, locale.
+ parse_result = parser.get_date_data(value)
+ # A datetime object, later we need a date, only.
+ result = parse_result['date_obj']
+ if result is not None:
+ if parse_result['period'] == 'year':
+ return None, None, result.year
+ elif parse_result['period'] == 'month':
+ return None, result.month, result.year
+ else:
+ return result.date(), result.month, result.year
+ except TypeError as err:
+ print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+ return None, None, None
def parse_datacite_dates(dates):
"""
@@ -966,7 +1009,7 @@ def parse_datacite_dates(dates):
)
def parse_item(item):
- result, value, year_only = None, item.get('date', ''), False
+ result, value, year_only = None, item.get('date', '') or '', False
release_date, release_month, release_year = None, None, None
for layout, granularity in common_patterns:
@@ -981,23 +1024,7 @@ def parse_datacite_dates(dates):
if result is None:
print('fallback for {}'.format(value), file=sys.stderr)
- parser = dateparser.DateDataParser()
- try:
- # Results in a dict with keys: date_obj, period, locale.
- parse_result = parser.get_date_data(value)
-
- # A datetime object, later we need a date, only.
- result = parse_result['date_obj']
- if result is not None:
- if parse_result['period'] == 'year':
- return None, None, result.year
- elif parse_result['period'] == 'month':
- return None, result.month, result.year
- else:
- return result.date(), result.month, result.year
- except TypeError as err:
- print("{} date parsing failed with: {}".format(value, err),
- file=sys.stderr)
+ release_date, release_month, release_year = parse_single_date(value)
if result is None:
# Unparsable date.
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bdfd2835..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter):
'fatcat-ingest-container',
'fatcat-ingest',
'arabesque',
+ 'mag-corpus',
+ 'mag',
+ 'unpaywall-corpus',
+ 'unpaywall',
+ 's2-corpus',
+ 's2',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter):
self.counts['skip-hit'] += 1
return False
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
if source.startswith('arabesque'):
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
self.counts['skip-arabesque-source'] += 1
return False
if source.startswith('savepapernow'):
@@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
if not 'terminal_dt' in terminal:
terminal['terminal_dt'] = terminal['dt']
assert len(terminal['terminal_dt']) == 14
- url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+ default_rel = self.default_link_rel
+ if request.get('link_source') == 'doi':
+ default_rel = 'publisher'
+ default_rel = request.get('rel', default_rel)
+ url = make_rel_url(terminal['terminal_url'], default_rel)
if not url:
self.counts['skip-url'] += 1
@@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
release_ids=[release_ident],
urls=urls,
)
- if fatcat and fatcat.get('edit_extra'):
- fe.edit_extra = fatcat['edit_extra']
+ if request.get('edit_extra'):
+ fe.edit_extra = request['edit_extra']
else:
fe.edit_extra = dict()
if request.get('ingest_request_source'):
@@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def want(self, row):
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if not source.startswith('savepapernow'):
self.counts['skip-not-savepapernow'] += 1
return False
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+ """
+ Importer for shadow library files (matched to releases)
+
+ Input format is JSON with keys:
+ - shadow
+ - shadow_corpus (string slug)
+ - shadow_id (string)
+ - doi
+ - pmid
+ - isbn13
+ - file_meta
+ - sha1hex
+ - sha256hex
+ - md5hex
+ - size_bytes
+ - mimetype
+ - cdx (may be null)
+ - url
+ - datetime
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+ self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+ def want(self, raw_record):
+ """
+ Only want to import records with complete file-level metadata
+ """
+ fm = raw_record['file_meta']
+ if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+ self.counts['skip-file-meta-incomplete'] += 1
+ return False
+ if fm['mimetype'] != 'application/pdf':
+ self.counts['skip-not-pdf'] += 1
+ return False
+ return True
+
+ def parse_record(self, obj):
+ """
+        We do the release lookup in this method: try DOI, then PMID, then finally ISBN13.
+ """
+
+ shadow_corpus = obj['shadow']['shadow_corpus']
+ assert shadow_corpus == shadow_corpus.strip().lower()
+ doi = clean_doi(obj['shadow'].get('doi'))
+ pmid = clean_pmid(obj['shadow'].get('pmid'))
+ isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+ shadow_id = obj['shadow'].get('shadow_id').strip()
+ assert shadow_id
+
+ extra = { '{}_id'.format(shadow_corpus): shadow_id }
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+ # lookup release via several idents
+ re = None
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ try:
+ re = self.api.lookup_release(**{ext_type: ext_id})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status not in (404, 400):
+ raise err
+ re = None
+ if re:
+ break
+
+ if not re:
+ self.counts['skip-release-not-found'] += 1
+ return None
+
+ release_ids = [re.ident,]
+
+ # parse single CDX into URLs (if exists)
+ urls = []
+ if obj.get('cdx'):
+ url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+ if url != None:
+ urls.append(url)
+ wayback = "https://web.archive.org/web/{}/{}".format(
+ obj['cdx']['datetime'],
+ obj['cdx']['url'])
+ urls.append(("webarchive", wayback))
+ urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+ fe = fatcat_openapi_client.FileEntity(
+ md5=obj['file_meta']['md5hex'],
+ sha1=obj['file_meta']['sha1hex'],
+ sha256=obj['file_meta']['sha256hex'],
+ size=int(obj['file_meta']['size_bytes']),
+ mimetype=obj['file_meta']['mimetype'] or None,
+ release_ids=release_ids,
+ urls=urls,
+ extra=dict(shadows=extra),
+ )
+ return fe
+
+ def try_update(self, fe):
+ # lookup sha1, or create new entity
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ return True
+
+ if not existing.extra:
+ existing.extra = {}
+
+ if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+ # already imported from this shadow library; skip
+ self.counts['exists'] += 1
+ return False
+
+ # check for edit conflicts
+ if existing.ident in [e.ident for e in self._edits_inflight]:
+ self.counts['skip-update-inflight'] += 1
+ return False
+ if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+ raise Exception("Inflight insert; shouldn't happen")
+
+ # minimum viable "existing" URL cleanup to fix dupes and broken links:
+ # remove 'None' wayback URLs, and set archive.org rel 'archive'
+ existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ for i in range(len(existing.urls)):
+ u = existing.urls[i]
+ if u.rel == 'repository' and '://archive.org/download/' in u.url:
+ existing.urls[i].rel = 'archive'
+ if u.rel == 'social':
+ u.rel = 'academicsocial'
+
+ # merge the existing into this one and update
+ merged_urls = {}
+ for u in fe.urls + existing.urls:
+ merged_urls[u.url] = u
+ existing.urls = list(merged_urls.values())
+ if not existing.extra.get('shadows'):
+ existing.extra['shadows'] = fe.extra['shadows']
+ else:
+ existing.extra['shadows'].update(fe.extra['shadows'])
+
+ # do these "plus ones" because we really want to do these updates when possible
+ if len(existing.urls) > SANE_MAX_URLS + 1:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
+ existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+ self.counts['skip-update-too-many-releases'] += 1
+ return None
+ existing.mimetype = existing.mimetype or fe.mimetype
+ existing.size = existing.size or fe.size
+ existing.md5 = existing.md5 or fe.md5
+ existing.sha1 = existing.sha1 or fe.sha1
+ existing.sha256 = existing.sha256 or fe.sha256
+ edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+ # add sha1 to non-entity edit row, so we can do more aggressive
+ # group-level de-dupe
+ edit.sha1 = existing.sha1
+ self._edits_inflight.append(edit)
+ self.counts['update'] += 1
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
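A minimal sketch of driving the new importer the same way the `shadow-lib` subcommand in fatcat_import.py does (the authenticated API client is assumed to be set up elsewhere):

    import sys
    from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher

    def run_shadow_import(api, json_file=sys.stdin):
        # mirrors run_shadow_lib() in fatcat_import.py
        importer = ShadowLibraryImporter(api, edit_batch_size=100)
        JsonLinePusher(importer, json_file).run()
        print(importer.counts, file=sys.stderr)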
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 27a4fb93..22b5154e 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -23,16 +23,16 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
link_source = "arxiv"
link_source_id = release.ext_ids.arxiv
- elif release.ext_ids.doi:
- url = "https://doi.org/{}".format(release.ext_ids.doi)
- link_source = "doi"
- link_source_id = release.ext_ids.doi
elif release.ext_ids.pmcid:
# TODO: how to tell if an author manuscript in PMC vs. published?
#url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
link_source = "pmc"
link_source_id = release.ext_ids.pmcid
+ elif release.ext_ids.doi:
+ url = "https://doi.org/{}".format(release.ext_ids.doi)
+ link_source = "doi"
+ link_source_id = release.ext_ids.doi
if not url:
return None
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index c8584ccf..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -102,6 +102,12 @@ class EntityUpdatesWorker(FatcatWorker):
# ccdc.cam.ac.uk: crystal structures
"10.5517/",
]
+ self.live_pdf_ingest_doi_prefix_acceptlist = [
+ # biorxiv and medrxiv
+ "10.1101/",
+ # researchgate
+ "10.13140/",
+ ]
def want_live_ingest(self, release, ingest_request):
"""
@@ -115,13 +121,55 @@ class EntityUpdatesWorker(FatcatWorker):
link_source = ingest_request.get('ingest_request')
ingest_type = ingest_request.get('ingest_type')
+ doi = ingest_request.get('ext_ids', {}).get('doi')
+
+ is_document = release.release_type in (
+ 'article-journal',
+ 'paper-conference',
+ 'article',
+ 'report',
+ 'chapter',
+ 'manuscript',
+ 'review',
+ 'thesis',
+ 'letter',
+ 'editorial',
+ 'abstract',
+ 'entry',
+ 'patent',
+ 'post',
+ 'review-book',
+ )
+ is_not_pdf = release.release_type in (
+ 'dataset',
+ 'stub',
+ 'software',
+ 'figure',
+ 'graphic',
+ )
+
+        # the accept list means "crawl it" for these known-OA DOI prefixes,
+        # even when the OA metadata would otherwise block the crawl
+ in_acceptlist = False
+ if doi:
+ for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+ if doi.startswith(prefix):
+ in_acceptlist = True
if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
es = release_to_elasticsearch(release)
- if not es['is_oa']:
+ # most datacite documents are in IRs and should be crawled
+ is_datacite_doc = False
+ if release.extra and ('datacite' in release.extra) and is_document:
+ is_datacite_doc = True
+ if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
return False
- doi = ingest_request.get('ext_ids', {}).get('doi')
+ # if ingest_type is pdf but release_type is almost certainly not a PDF,
+ # skip it. This is mostly a datacite thing.
+ if ingest_type == "pdf" and is_not_pdf:
+ return False
+
if ingest_type == "pdf" and doi:
for prefix in self.ingest_pdf_doi_prefix_blocklist:
if doi.startswith(prefix):
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index af0fea83..591dda80 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -53,6 +53,10 @@ def enrich_release_entity(entity):
entity._es = release_to_elasticsearch(entity, force_bool=False)
if entity.container and entity.container.state == "active":
entity.container._es = container_to_elasticsearch(entity.container, force_bool=False)
+ if entity.files:
+ # remove shadows-only files with no URLs
+ entity.files = [f for f in entity.files
+ if not (f.extra and f.extra.get('shadows') and not f.urls)]
if entity.filesets:
for fs in entity.filesets:
fs._total_size = sum([f.size for f in fs.manifest])
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 83ecd1c8..961b4759 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -196,8 +196,9 @@
</tbody>
</table>
{% else %}
-<p>There are no known files associated with this release (you could try
-<a href="/work/{{ release.work_id }}">other releases for this work?</a>).
+<p>There are no accessible files associated with this release. You could check
+<a href="/work/{{ release.work_id }}">other releases for this work</a> for an
+accessible version.
{% endif %}
{% endif %}
diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json
new file mode 100644
index 00000000..5f851bbb
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_30.json
@@ -0,0 +1,72 @@
+{
+ "id": "10.17912/micropub.biology.000143",
+ "type": "dois",
+ "attributes": {
+ "doi": "10.17912/micropub.biology.000143",
+ "identifiers": null,
+ "creators": [
+ {
+ "raw_name": "Celja J Uebel",
+ "givenName": "Celja J",
+ "familyName": "Uebel",
+ "affiliation": [],
+ "role": "author"
+ },
+ {
+ "raw_name": "Carolyn M Phillips",
+ "givenName": "Carolyn M",
+ "familyName": "Phillips",
+ "affiliation": [],
+ "role": "author"
+ }
+ ],
+ "titles": [
+ {
+ "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+ }
+ ],
+ "publisher": "microPublication Biology",
+ "container": {},
+ "publicationYear": 2019,
+ "subjects": [],
+ "contributors": [],
+ "dates": null,
+ "language": null,
+ "types": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "relatedIdentifiers": [],
+ "sizes": [],
+ "formats": [],
+ "version": null,
+ "rightsList": [],
+ "descriptions": [
+ {
+ "description": "Biological liquid-liquid phase separation",
+ "descriptionType": "Abstract"
+ }
+ ],
+ "geoLocations": [],
+ "fundingReferences": [],
+ "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+ "contentUrl": null,
+ "metadataVersion": 0,
+ "schemaVersion": null,
+ "source": null,
+ "isActive": true,
+ "state": "findable",
+ "reason": null,
+ "created": "2019-08-19T14:43:08.000Z",
+ "registered": "2019-08-19T14:43:09.000Z",
+ "published": "2019",
+ "updated": "2019-11-09T12:32:02.000Z"
+ },
+ "relationships": {
+ "client": {
+ "data": {
+ "id": "caltech.micropub",
+ "type": "clients"
+ }
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json
new file mode 100644
index 00000000..fc2c4dfc
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_30.json
@@ -0,0 +1,39 @@
+{
+ "abstracts": [
+ {
+ "content": "Biological liquid-liquid phase separation",
+ "lang": "fr",
+ "mimetype": "text/plain"
+ }
+ ],
+ "contribs": [
+ {
+ "index": 0,
+ "given_name": "Celja J",
+ "surname": "Uebel",
+ "raw_name": "Celja J Uebel",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "given_name": "Carolyn M",
+ "raw_name": "Carolyn M Phillips",
+ "surname": "Phillips",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.17912/micropub.biology.000143"
+ },
+ "extra": {
+ "datacite": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "container_name": "microPublication Biology"
+ },
+ "refs": [],
+ "release_stage": "published",
+ "release_year": 2019,
+ "publisher": "microPublication Biology",
+ "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+}
diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json
new file mode 100644
index 00000000..3386f481
--- /dev/null
+++ b/python/tests/files/example_shadow.json
@@ -0,0 +1,10 @@
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 669a6984..15650375 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer):
for now.
"""
datacite_importer.debug = True
- for i in range(30):
+ for i in range(31):
src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
- print('testing mapping from {} => {}'.format(src, dst))
with open(src, 'r') as f:
re = datacite_importer.parse_record(json.load(f))
result = entity_to_dict(re)
diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py
new file mode 100644
index 00000000..70a918d2
--- /dev/null
+++ b/python/tests/import_shadow.py
@@ -0,0 +1,61 @@
+
+import json
+import pytest
+from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher
+from fixtures import api
+
+
+@pytest.fixture(scope="function")
+def shadow_importer(api):
+ yield ShadowLibraryImporter(api)
+
+# TODO: use API to check that entities actually created...
+def test_shadow_importer_basic(shadow_importer):
+ with open('tests/files/example_shadow.json', 'r') as f:
+ JsonLinePusher(shadow_importer, f).run()
+
+def test_shadow_importer(shadow_importer):
+ last_index = shadow_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/example_shadow.json', 'r') as f:
+ shadow_importer.bezerk_mode = True
+ counts = JsonLinePusher(shadow_importer, f).run()
+ assert counts['insert'] == 2
+ assert counts['exists'] == 0
+ assert counts['skip'] == 8
+
+ # fetch most recent editgroup
+ change = shadow_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "shadow library" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.ShadowLibraryImporter" in eg.extra['agent']
+
+ # re-insert; should skip
+ with open('tests/files/example_shadow.json', 'r') as f:
+ shadow_importer.reset()
+ shadow_importer.bezerk_mode = False
+ counts = JsonLinePusher(shadow_importer, f).run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 2
+ assert counts['skip'] == 8
+
+def test_shadow_dict_parse(shadow_importer):
+ with open('tests/files/example_shadow.json', 'r') as f:
+ raw = json.loads(f.readline())
+ f = shadow_importer.parse_record(raw)
+
+ assert f.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7"
+ assert f.md5 == "debd8db178fa08a7a0aaec6e42832a8e"
+ assert f.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79"
+ assert f.mimetype == "application/pdf"
+ assert f.size == 206121
+ assert len(f.urls) == 2
+ for u in f.urls:
+ if u.rel == "publisher":
+ assert u.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf")
+ if u.rel == "webarchive":
+ assert u.url.startswith("https://web.archive.org/")
+ assert "20180729135948" in u.url
+ assert len(f.release_ids) == 1
+