Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/chocula.py  10
-rw-r--r--  python/fatcat_tools/importers/ingest.py   19
-rw-r--r--  python/fatcat_tools/importers/orcid.py      4
-rw-r--r--  python/fatcat_tools/importers/pubmed.py    92
4 files changed, 62 insertions, 63 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 6915ba98..eea50314 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -128,15 +128,15 @@ class ChoculaImporter(EntityImporter):
existing.publisher = ce.publisher
existing.container_type = existing.container_type or ce.container_type
for k in ('urls', 'webarchive_urls'):
- # update, or clobber/remove any existing values. often
- # want/need to remove dead URLs
+ # update, which might clobber, but won't remove
if ce.extra.get(k):
existing.extra[k] = ce.extra.get(k, [])
- elif k in existing.extra.keys():
- existing.extra.pop(k)
+ # note: in some cases we might *want* to clobber existing (if
+ # all URLs found to be bad), but being conservative for now so
+ # we don't clobber human edits
for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
'szczepanski', 'doaj'):
- # update, but don't remove any existing value
+ # update/overwrite, but don't remove any existing value
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
if ce.extra.get('languages'):
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index c47f0aa7..33c40eff 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -29,6 +29,7 @@ class IngestFileResultImporter(EntityImporter):
self.ingest_request_source_whitelist = [
'fatcat-changelog',
'fatcat-ingest-container',
+ 'arabesque',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -55,6 +56,10 @@ class IngestFileResultImporter(EntityImporter):
if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
+ if source.startswith('arabesque'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc'):
+ self.counts['skip-arabesque-source'] += 1
+ return False
if source.startswith('savepapernow'):
# never process async savepapernow requests
self.counts['skip-savepapernow'] += 1
@@ -152,20 +157,22 @@ class IngestFileResultImporter(EntityImporter):
if err.status != 404:
raise err
+ # check for existing edits-in-progress with same file hash
+ for other in self._entity_queue:
+ if other.sha1 == fe.sha1:
+ self.counts['skip-in-queue'] += 1
+ return False
+
if not existing:
return True
+ # the following checks all assume there is an existing item
+
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
self.counts['exists'] += 1
return False
- # check for existing edits-in-progress with same file hash
- for other in self._entity_queue:
- if other.sha1 == fe.sha1:
- self.counts['skip-in-queue'] += 1
- return False
-
if not self.do_updates:
self.counts['skip-update-disabled'] += 1
return False
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 0a2c8610..554e052f 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -41,6 +41,10 @@ class OrcidImporter(EntityImporter):
obj is a python dict (parsed from json).
returns a CreatorEntity
"""
+
+ if 'person' not in obj:
+ return False
+
name = obj['person']['name']
if not name:
return None
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 80cf986c..3611a299 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
from bs4.element import NavigableString
import fatcat_openapi_client
+from fatcat_tools.normal import *
from .common import EntityImporter, clean, LANG_MAP_MARC
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
@@ -317,7 +318,7 @@ class PubmedImporter(EntityImporter):
TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
"""
- def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs):
+ def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
eg_desc = kwargs.get('editgroup_description',
"Automated import of PubMed/MEDLINE XML metadata")
@@ -330,38 +331,9 @@ class PubmedImporter(EntityImporter):
**kwargs)
self.lookup_refs = lookup_refs
- extid_map_file = kwargs.get('extid_map_file')
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.create_containers = kwargs.get('create_containers', True)
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, pmid):
- if self.extid_map_db is None:
- return dict(doi=None, core_id=None, pmid=None, pmcid=None,
- wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
- [pmid]).fetchone()
- if row is None:
- return dict(doi=None, core_id=None, pmid=None, pmcid=None,
- wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
- return dict(
- core_id=row[0],
- doi=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, obj):
return True
@@ -376,20 +348,14 @@ class PubmedImporter(EntityImporter):
identifiers = pubmed.ArticleIdList
pmid = medline.PMID.string.strip()
doi = identifiers.find("ArticleId", IdType="doi")
- if doi:
- doi = doi.string.lower().strip()
- if doi.startswith('doi:'):
- doi = doi[4:]
- if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]) and len(doi.split()) == 1:
- sys.stderr.write("BOGUS DOI: {}\n".format(doi))
- doi = None
+ if doi and doi.string:
+ doi = clean_doi(doi.string)
+ else:
+ doi = None
pmcid = identifiers.find("ArticleId", IdType="pmc")
if pmcid:
- pmcid = pmcid.string.strip().upper()
- # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
- if not pmcid.startswith("PMC"):
- pmcid = None
+ pmcid = clean_pmcid(pmcid.string.strip().upper())
release_type = None
pub_types = []
@@ -486,6 +452,8 @@ class PubmedImporter(EntityImporter):
pub_date = medline.Article.find('ArticleDate')
if not pub_date:
pub_date = journal.PubDate
+ if not pub_date:
+ pub_date = journal.JournalIssue.PubDate
release_date = None
release_year = None
if pub_date.Year:
@@ -498,8 +466,17 @@ class PubmedImporter(EntityImporter):
int(pub_date.Day.string))
release_date = release_date.isoformat()
except ValueError as ve:
- sys.stderr.write("bad date, skipping: {}\n".format(ve))
+ print("bad date, skipping: {}".format(ve), file=sys.stderr)
release_date = None
+ elif pub_date.MedlineDate:
+ medline_date = pub_date.MedlineDate.string.strip()
+ if len(medline_date) >= 4 and medline_date[:4].isdigit():
+ release_year = int(medline_date[:4])
+ if release_year < 1300 or release_year > 2040:
+ print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+ release_year = None
+ else:
+ print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
if journal.find("Title"):
container_name = journal.Title.string
@@ -641,19 +618,21 @@ class PubmedImporter(EntityImporter):
if pubmed.ReferenceList:
for ref in pubmed.ReferenceList.find_all('Reference'):
ref_extra = dict()
- ref_pmid = ref.find("ArticleId", IdType="pubmed")
ref_doi = ref.find("ArticleId", IdType="doi")
- ref_release_id = None
+ if ref_doi:
+ ref_doi = clean_doi(ref_doi.string)
+ ref_pmid = ref.find("ArticleId", IdType="pubmed")
if ref_pmid:
- ref_pmid = ref_pmid.string.strip()
- ref_extra['pmid'] = ref_pmid
- if self.lookup_refs:
- ref_release_id = self.lookup_pmid(ref_pmid)
+ ref_pmid = clean_pmid(ref_pmid.string)
+ ref_release_id = None
if ref_doi:
- ref_doi = ref_doi.string.lower().strip()
ref_extra['doi'] = ref_doi
if self.lookup_refs:
ref_release_id = self.lookup_doi(ref_doi)
+ if ref_pmid:
+ ref_extra['pmid'] = ref_pmid
+ if self.lookup_refs:
+ ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
ref_extra['unstructured'] = ref_raw.string
@@ -668,7 +647,6 @@ class PubmedImporter(EntityImporter):
# extra:
# translation_of
- # subtitle
# aliases
# container_name
# group-title
@@ -729,8 +707,9 @@ class PubmedImporter(EntityImporter):
if err.status != 404:
raise err
if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
- warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
- existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+ warnings.warn(warn_str)
self.counts['warn-pmid-doi-mismatch'] += 1
# don't clobber DOI, but do group together
re.ext_ids.doi = None
@@ -748,6 +727,15 @@ class PubmedImporter(EntityImporter):
existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
existing.refs = existing.refs or re.refs
existing.extra['pubmed'] = re.extra['pubmed']
+ # update subtitle in-place first
+ if not existing.subtitle and existing.extra.get('subtitle'):
+ subtitle = existing.extra.pop('subtitle')
+ if isinstance(subtitle, list):
+ subtitle = subtitle[0]
+ if subtitle:
+ existing.subtitle = subtitle
+ if not existing.subtitle:
+ existing.subtitle = re.subtitle
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
self.counts['update'] += 1