summary refs log tree commit diff stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/common.py 2
-rw-r--r-- python/fatcat_tools/importers/dblp_release.py 11
-rw-r--r-- python/fatcat_tools/importers/ingest.py 6
-rw-r--r-- python/fatcat_tools/normal.py 4
4 files changed, 19 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2446cdbf..fcbe9ad2 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -164,7 +164,7 @@ class EntityImporter:
self.es_client = kwargs.get('es_client')
if not self.es_client:
- self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
+ self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
self._issnl_id_map = dict()
self._orcid_id_map = dict()
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5cbc95d0..daecd765 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -323,6 +323,15 @@ class DblpReleaseImporter(EntityImporter):
if err.status != 404:
raise err
+ # Just skip all releases with an arxiv_id for now. Have not decided
+ # what to do about grouping works and lookup of un-versioned arxiv_id
+ # yet. Note that this means we will lack coverage of some works which
+ # have an arxiv preprint, but in those cases we will presumably at
+ # least have the pre-print copy/record.
+ if re.ext_ids.arxiv:
+ self.counts["skip-arxiv"] += 1
+ return False
+
# then try other ext_id lookups
if not existing:
for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
@@ -361,7 +370,7 @@ class DblpReleaseImporter(EntityImporter):
return False
# logic for whether to do update or skip
- if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv_id:
+ if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
self.counts['skip-update'] += 1
return False
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index cd3d53f6..04ff8db6 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -86,7 +86,7 @@ class IngestFileResultImporter(EntityImporter):
self.counts['skip-ingest_request_source'] += 1
return False
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
self.counts['skip-link-source'] += 1
return False
@@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
if 'revisit_cdx' in row:
terminal_cdx = row['revisit_cdx']
assert terminal_cdx['surt']
- assert terminal_cdx['url'] == terminal['terminal_url']
+ if terminal_cdx['url'] != terminal['terminal_url']:
+ self.counts['skip-terminal-url-mismatch'] += 1
+ return None
wc_cdx = []
# primary resource first
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index d792979d..4218856c 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -94,6 +94,9 @@ def clean_arxiv_id(raw):
- 'arxiv:' prefix
Works with versioned or un-versioned arxiv identifiers.
+
+ TODO: version of this function that only works with versioned identifiers?
+ That is the behavior of fatcat API
"""
if not raw:
return None
@@ -116,6 +119,7 @@ def test_clean_arxiv_id():
assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2"
assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800"
assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1"
+ assert clean_arxiv_id("cs/0207047") == "cs/0207047"
assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1"
assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1"