summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py1
-rw-r--r--python/fatcat_tools/importers/common.py9
-rw-r--r--python/fatcat_tools/importers/datacite.py75
-rw-r--r--python/fatcat_tools/importers/ingest.py25
-rw-r--r--python/fatcat_tools/importers/shadow.py195
5 files changed, 277 insertions, 28 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 1ffbd6e7..a84ce90f 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
"www.scielo.cl": "repository",
"www.scielo.org.mx": "repository",
"zenodo.org": "repository",
+ "www.biorxiv.org": "repository",
+ "www.medrxiv.org": "repository",
"citeseerx.ist.psu.edu": "aggregator",
"publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
"www.nature.com": "publisher",
"www.pnas.org": "publisher",
"www.tandfonline.com": "publisher",
+ "www.frontiersin.org": "publisher",
+ "www.degruyter.com": "publisher",
+ "www.mdpi.com": "publisher",
+ "www.ahajournals.org": "publisher",
+ "ehp.niehs.nih.gov": "publisher",
+ "journals.tsu.ru": "publisher",
+ "www.cogentoa.com": "publisher",
"www.researchgate.net": "academicsocial",
"academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..4e382348 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,11 +1,11 @@
"""
Prototype importer for datacite.org data.
-Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51
-Datacite being an aggregator, the data is varied and exposes a couple of
-problems in content and structure. A few fields habe their own parsing
-functions (parse_datacite_...), which can be tested more easily.
+Datacite being an aggregator, the data is heterogenous and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which may help testing.
"""
import collections
@@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter):
release_date, release_month, release_year = parse_datacite_dates(
attributes.get('dates', []))
+ # Some records do not use the "dates" field (e.g. micropub), but:
+ # "attributes.published" or "attributes.publicationYear"
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+ if not any((release_date, release_month, release_year)):
+ print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter):
len(container_name)))
container_name = container_name[0]
+ # Exception: https://www.micropublication.org/, see: !MR24.
+ if container_id is None and container_name is None:
+ if publisher and publisher.lower().startswith('micropublication'):
+ container_name = publisher
+
# Volume and issue.
volume = container.get('volume')
issue = container.get('issue')
@@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter):
if len(text) > MAX_ABSTRACT_LENGTH:
text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
- # Detect language.
+ # Detect language. This is fuzzy and may be removed, if too unreliable.
lang = None
try:
lang = langdetect.detect(text)
@@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter):
if name:
name = clean(name)
- if not name:
+ if not any((name, given_name, surname)):
continue
+ if not name:
+ name = "{} {}".format(given_name or '', surname or '').strip()
if name in name_blacklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +941,32 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
+def parse_single_date(value):
+ """
+ Given a single string containing a date in arbitrary format, try to return
+ tuple (date: datetime.date, month: int, year: int).
+ """
+ if not value:
+ return None, None, None
+ if isinstance(value, int):
+ value = str(value)
+ parser = dateparser.DateDataParser()
+ try:
+ # Results in a dict with keys: date_obj, period, locale.
+ parse_result = parser.get_date_data(value)
+ # A datetime object, later we need a date, only.
+ result = parse_result['date_obj']
+ if result is not None:
+ if parse_result['period'] == 'year':
+ return None, None, result.year
+ elif parse_result['period'] == 'month':
+ return None, result.month, result.year
+ else:
+ return result.date(), result.month, result.year
+ except TypeError as err:
+ print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+ return None, None, None
def parse_datacite_dates(dates):
"""
@@ -966,7 +1009,7 @@ def parse_datacite_dates(dates):
)
def parse_item(item):
- result, value, year_only = None, item.get('date', ''), False
+ result, value, year_only = None, item.get('date', '') or '', False
release_date, release_month, release_year = None, None, None
for layout, granularity in common_patterns:
@@ -981,23 +1024,7 @@ def parse_datacite_dates(dates):
if result is None:
print('fallback for {}'.format(value), file=sys.stderr)
- parser = dateparser.DateDataParser()
- try:
- # Results in a dict with keys: date_obj, period, locale.
- parse_result = parser.get_date_data(value)
-
- # A datetime object, later we need a date, only.
- result = parse_result['date_obj']
- if result is not None:
- if parse_result['period'] == 'year':
- return None, None, result.year
- elif parse_result['period'] == 'month':
- return None, result.month, result.year
- else:
- return result.date(), result.month, result.year
- except TypeError as err:
- print("{} date parsing failed with: {}".format(value, err),
- file=sys.stderr)
+ release_date, release_month, release_year = parse_single_date(value)
if result is None:
# Unparsable date.
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bdfd2835..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter):
'fatcat-ingest-container',
'fatcat-ingest',
'arabesque',
+ 'mag-corpus',
+ 'mag',
+ 'unpaywall-corpus',
+ 'unpaywall',
+ 's2-corpus',
+ 's2',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter):
self.counts['skip-hit'] += 1
return False
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
if source.startswith('arabesque'):
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
self.counts['skip-arabesque-source'] += 1
return False
if source.startswith('savepapernow'):
@@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
if not 'terminal_dt' in terminal:
terminal['terminal_dt'] = terminal['dt']
assert len(terminal['terminal_dt']) == 14
- url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+ default_rel = self.default_link_rel
+ if request.get('link_source') == 'doi':
+ default_rel = 'publisher'
+ default_rel = request.get('rel', default_rel)
+ url = make_rel_url(terminal['terminal_url'], default_rel)
if not url:
self.counts['skip-url'] += 1
@@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
release_ids=[release_ident],
urls=urls,
)
- if fatcat and fatcat.get('edit_extra'):
- fe.edit_extra = fatcat['edit_extra']
+ if request.get('edit_extra'):
+ fe.edit_extra = request['edit_extra']
else:
fe.edit_extra = dict()
if request.get('ingest_request_source'):
@@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def want(self, row):
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if not source.startswith('savepapernow'):
self.counts['skip-not-savepapernow'] += 1
return False
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+ """
+ Importer for shadow library files (matched to releases)
+
+ Input format is JSON with keys:
+ - shadow
+ - shadow_corpus (string slug)
+ - shadow_id (string)
+ - doi
+ - pmid
+ - isbn13
+ - file_meta
+ - sha1hex
+ - sha256hex
+ - md5hex
+ - size_bytes
+ - mimetype
+ - cdx (may be null)
+ - url
+ - datetime
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+ self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+ def want(self, raw_record):
+ """
+ Only want to import records with complete file-level metadata
+ """
+ fm = raw_record['file_meta']
+ if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+ self.counts['skip-file-meta-incomplete'] += 1
+ return False
+ if fm['mimetype'] != 'application/pdf':
+ self.counts['skip-not-pdf'] += 1
+ return False
+ return True
+
+ def parse_record(self, obj):
+ """
+ We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+ """
+
+ shadow_corpus = obj['shadow']['shadow_corpus']
+ assert shadow_corpus == shadow_corpus.strip().lower()
+ doi = clean_doi(obj['shadow'].get('doi'))
+ pmid = clean_pmid(obj['shadow'].get('pmid'))
+ isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+ shadow_id = obj['shadow'].get('shadow_id').strip()
+ assert shadow_id
+
+ extra = { '{}_id'.format(shadow_corpus): shadow_id }
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+ # lookup release via several idents
+ re = None
+ for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ if not ext_id:
+ continue
+ try:
+ re = self.api.lookup_release(**{ext_type: ext_id})
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status not in (404, 400):
+ raise err
+ re = None
+ if re:
+ break
+
+ if not re:
+ self.counts['skip-release-not-found'] += 1
+ return None
+
+ release_ids = [re.ident,]
+
+ # parse single CDX into URLs (if exists)
+ urls = []
+ if obj.get('cdx'):
+ url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+ if url != None:
+ urls.append(url)
+ wayback = "https://web.archive.org/web/{}/{}".format(
+ obj['cdx']['datetime'],
+ obj['cdx']['url'])
+ urls.append(("webarchive", wayback))
+ urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+ fe = fatcat_openapi_client.FileEntity(
+ md5=obj['file_meta']['md5hex'],
+ sha1=obj['file_meta']['sha1hex'],
+ sha256=obj['file_meta']['sha256hex'],
+ size=int(obj['file_meta']['size_bytes']),
+ mimetype=obj['file_meta']['mimetype'] or None,
+ release_ids=release_ids,
+ urls=urls,
+ extra=dict(shadows=extra),
+ )
+ return fe
+
+ def try_update(self, fe):
+ # lookup sha1, or create new entity
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ return True
+
+ if not existing.extra:
+ existing.extra = {}
+
+ if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+ # already imported from this shadow library; skip
+ self.counts['exists'] += 1
+ return False
+
+ # check for edit conflicts
+ if existing.ident in [e.ident for e in self._edits_inflight]:
+ self.counts['skip-update-inflight'] += 1
+ return False
+ if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+ raise Exception("Inflight insert; shouldn't happen")
+
+ # minimum viable "existing" URL cleanup to fix dupes and broken links:
+ # remove 'None' wayback URLs, and set archive.org rel 'archive'
+ existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ for i in range(len(existing.urls)):
+ u = existing.urls[i]
+ if u.rel == 'repository' and '://archive.org/download/' in u.url:
+ existing.urls[i].rel = 'archive'
+ if u.rel == 'social':
+ u.rel = 'academicsocial'
+
+ # merge the existing into this one and update
+ merged_urls = {}
+ for u in fe.urls + existing.urls:
+ merged_urls[u.url] = u
+ existing.urls = list(merged_urls.values())
+ if not existing.extra.get('shadows'):
+ existing.extra['shadows'] = fe.extra['shadows']
+ else:
+ existing.extra['shadows'].update(fe.extra['shadows'])
+
+ # do these "plus ones" because we really want to do these updates when possible
+ if len(existing.urls) > SANE_MAX_URLS + 1:
+ self.counts['skip-update-too-many-url'] += 1
+ return None
+ existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+ self.counts['skip-update-too-many-releases'] += 1
+ return None
+ existing.mimetype = existing.mimetype or fe.mimetype
+ existing.size = existing.size or fe.size
+ existing.md5 = existing.md5 or fe.md5
+ existing.sha1 = existing.sha1 or fe.sha1
+ existing.sha256 = existing.sha256 or fe.sha256
+ edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+ # add sha1 to non-entity edit row, so we can do more aggressive
+ # group-level de-dupe
+ edit.sha1 = existing.sha1
+ self._edits_inflight.append(edit)
+ self.counts['update'] += 1
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+