diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 11 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/normal.py | 4 |
4 files changed, 19 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2446cdbf..fcbe9ad2 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -164,7 +164,7 @@ class EntityImporter: self.es_client = kwargs.get('es_client') if not self.es_client: - self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") + self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120) self._issnl_id_map = dict() self._orcid_id_map = dict() diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 5cbc95d0..daecd765 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -323,6 +323,15 @@ class DblpReleaseImporter(EntityImporter): if err.status != 404: raise err + # Just skip all releases with an arxiv_id for now. Have not decided + # what to do about grouping works and lookup of un-versioned arxiv_id + # yet. Note that this means we will lack coverage of some works which + # have an arxiv preprint, but in those cases we will presumably at + # least have the pre-print copy/record. + if re.ext_ids.arxiv: + self.counts["skip-arxiv"] += 1 + return False + # then try other ext_id lookups if not existing: for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): @@ -361,7 +370,7 @@ class DblpReleaseImporter(EntityImporter): return False # logic for whether to do update or skip - if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv_id: + if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv: self.counts['skip-update'] += 1 return False diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index cd3d53f6..04ff8db6 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -86,7 +86,7 @@ class IngestFileResultImporter(EntityImporter): self.counts['skip-ingest_request_source'] += 1 return False - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj'): + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): self.counts['skip-link-source'] += 1 return False @@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter): if 'revisit_cdx' in row: terminal_cdx = row['revisit_cdx'] assert terminal_cdx['surt'] - assert terminal_cdx['url'] == terminal['terminal_url'] + if terminal_cdx['url'] != terminal['terminal_url']: + self.counts['skip-terminal-url-mismatch'] += 1 + return None wc_cdx = [] # primary resource first diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index d792979d..4218856c 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -94,6 +94,9 @@ def clean_arxiv_id(raw): - 'arxiv:' prefix Works with versioned or un-versioned arxiv identifiers. + + TODO: version of this function that only works with versioned identifiers? + That is the behavior of fatcat API """ if not raw: return None @@ -116,6 +119,7 @@ def test_clean_arxiv_id(): assert clean_arxiv_id("math.CA/0611800v2") == "math.CA/0611800v2" assert clean_arxiv_id("math.CA/0611800") == "math.CA/0611800" assert clean_arxiv_id("0806.2878v1 ") == "0806.2878v1" + assert clean_arxiv_id("cs/0207047") == "cs/0207047" assert clean_arxiv_id("https://arxiv.org/abs/0806.2878v1") == "0806.2878v1" assert clean_arxiv_id("arxiv:0806.2878v1") == "0806.2878v1" |