Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/harvest/doi_registrars.py | 32
-rw-r--r--  python/fatcat_tools/importers/chocula.py      | 10
-rw-r--r--  python/fatcat_tools/importers/ingest.py       | 19
-rw-r--r--  python/fatcat_tools/importers/orcid.py        |  4
-rw-r--r--  python/fatcat_tools/importers/pubmed.py       | 92
-rw-r--r--  python/fatcat_tools/normal.py                 | 38
-rw-r--r--  python/fatcat_tools/transforms/ingest.py      |  8
7 files changed, 131 insertions, 72 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 13abb2e6..33f44600 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -8,6 +8,7 @@ import itertools
 import datetime
 import requests
 from confluent_kafka import Producer, KafkaException
+from urllib.parse import urlparse, parse_qs
 
 from fatcat_tools.workers import most_recent_message
 from .harvest_common import HarvestState, requests_retry_session
@@ -121,6 +122,10 @@ class HarvestCrossrefWorker:
                 self.producer.poll(0)
                 time.sleep(30.0)
                 continue
+            if http_resp.status_code == 400:
+                print("skipping batch for {}, due to HTTP 400. Marking complete. Related: https://github.com/datacite/datacite/issues/897".format(date_str),
+                    file=sys.stderr)
+                break
             http_resp.raise_for_status()
             resp = http_resp.json()
             items = self.extract_items(resp)
@@ -179,7 +184,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
     """
 
     def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
-            api_host_url="https://api.datacite.org/works",
+            api_host_url="https://api.datacite.org/dois",
             start_date=None, end_date=None):
         super().__init__(kafka_hosts=kafka_hosts,
             produce_topic=produce_topic,
@@ -193,11 +198,13 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
         self.name = "Datacite"
 
     def params(self, date_str):
+        """
+        Dates have to be supplied in 2018-10-27T22:36:30.000Z format.
+        """
         return {
-            'from-update-date': date_str,
-            'until-update-date': date_str,
+            'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str),
             'page[size]': self.api_batch_size,
-            'page[number]': 1,
+            'page[cursor]': 1,
         }
 
     def extract_items(self, resp):
@@ -210,5 +217,20 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
         return obj['attributes']['doi'].encode('utf-8')
 
     def update_params(self, params, resp):
-        params['page[number]'] = resp['meta']['page'] + 1
+        """
+        Using cursor mechanism (https://support.datacite.org/docs/pagination#section-cursor).
+
+        $ curl -sL https://is.gd/cLbE5h | jq -r .links.next
+
+        Example: https://is.gd/cLbE5h
+
+        Further API errors reported:
+        https://github.com/datacite/datacite/issues/897 (HTTP 400)
+        https://github.com/datacite/datacite/issues/898 (HTTP 500)
+        """
+        parsed = urlparse(resp['links']['next'])
+        page_cursor = parse_qs(parsed.query).get('page[cursor]')
+        if not page_cursor:
+            raise ValueError('no page[cursor] in .links.next')
+        params['page[cursor]'] = page_cursor[0]
         return params
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 6915ba98..eea50314 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -128,15 +128,15 @@ class ChoculaImporter(EntityImporter):
                 existing.publisher = ce.publisher
             existing.container_type = existing.container_type or ce.container_type
             for k in ('urls', 'webarchive_urls'):
-                # update, or clobber/remove any existing values. often
-                # want/need to remove dead URLs
+                # update, which might clobber, but won't remove
                 if ce.extra.get(k):
                     existing.extra[k] = ce.extra.get(k, [])
-                elif k in existing.extra.keys():
-                    existing.extra.pop(k)
+                # note: in some cases we might *want* to clobber existing (if
+                # all URLs found to be bad), but being conservative for now so
+                # we don't clobber human edits
             for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
                       'szczepanski', 'doaj'):
-                # update, but don't remove any existing value
+                # update/overwrite, but don't remove any existing value
                 if ce.extra.get(k):
                     existing.extra[k] = ce.extra[k]
             if ce.extra.get('languages'):
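Editor's note on the harvester change above: update_params() now follows DataCite's cursor pagination by parsing the page[cursor] value out of the links.next URL, instead of incrementing page[number] (which DataCite caps for deep paging). A minimal standalone sketch of that loop against the new /dois endpoint; the harvest_day() helper and its control flow are illustrative, not the worker's actual structure, and the data/links response shape follows DataCite's documented JSON:API format:

    import sys
    from urllib.parse import urlparse, parse_qs

    import requests

    def harvest_day(date_str, batch_size=100):
        # query DOIs updated within a single UTC day
        params = {
            'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str),
            'page[size]': batch_size,
            'page[cursor]': 1,  # initial cursor value, per DataCite pagination docs
        }
        while True:
            resp = requests.get("https://api.datacite.org/dois", params=params)
            if resp.status_code == 400:
                # known upstream issue; see github.com/datacite/datacite/issues/897
                print("HTTP 400 for {}, skipping day".format(date_str), file=sys.stderr)
                return
            resp.raise_for_status()
            body = resp.json()
            if not body['data']:
                return
            for obj in body['data']:
                yield obj['attributes']['doi']
            # follow the server-supplied cursor instead of counting pages
            next_url = body.get('links', {}).get('next')
            if not next_url:
                return
            cursor = parse_qs(urlparse(next_url).query).get('page[cursor]')
            if not cursor:
                raise ValueError('no page[cursor] in .links.next')
            params['page[cursor]'] = cursor[0]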
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index c47f0aa7..33c40eff 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -29,6 +29,7 @@ class IngestFileResultImporter(EntityImporter):
         self.ingest_request_source_whitelist = [
             'fatcat-changelog',
             'fatcat-ingest-container',
+            'arabesque',
         ]
         if kwargs.get('skip_source_whitelist', False):
             self.ingest_request_source_whitelist = []
@@ -55,6 +56,10 @@ class IngestFileResultImporter(EntityImporter):
         if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
             self.counts['skip-ingest_request_source'] += 1
             return False
+        if source.startswith('arabesque'):
+            if row['request'].get('link_source') not in ('arxiv', 'pmc'):
+                self.counts['skip-arabesque-source'] += 1
+                return False
         if source.startswith('savepapernow'):
             # never process async savepapernow requests
             self.counts['skip-savepapernow'] += 1
@@ -152,20 +157,22 @@ class IngestFileResultImporter(EntityImporter):
             if err.status != 404:
                 raise err
 
+        # check for existing edits-in-progress with same file hash
+        for other in self._entity_queue:
+            if other.sha1 == fe.sha1:
+                self.counts['skip-in-queue'] += 1
+                return False
+
         if not existing:
             return True
 
+        # the following checks all assume there is an existing item
+
         if (fe.release_ids[0] in existing.release_ids) and existing.urls:
             # TODO: could still, in theory update with the new URL?
             self.counts['exists'] += 1
             return False
 
-        # check for existing edits-in-progress with same file hash
-        for other in self._entity_queue:
-            if other.sha1 == fe.sha1:
-                self.counts['skip-in-queue'] += 1
-                return False
-
         if not self.do_updates:
             self.counts['skip-update-disabled'] += 1
             return False
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 0a2c8610..554e052f 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -41,6 +41,10 @@ class OrcidImporter(EntityImporter):
         obj is a python dict (parsed from json). returns a CreatorEntity
         """
+
+        if not 'person' in obj:
+            return False
+
         name = obj['person']['name']
         if not name:
             return None
 
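Editor's note on the importer change above: arabesque-sourced ingest results are now accepted, but only for arxiv and pmc link sources. A condensed sketch of the want() gating; the nested 'request' dict, the skip-counter names, and the whitelist come from the diff, while the 'ingest_request_source' key name and the free function are assumptions for illustration:

    from collections import Counter

    def want_ingest_row(row, counts,
                        whitelist=('fatcat-changelog', 'fatcat-ingest-container', 'arabesque')):
        source = row['request'].get('ingest_request_source')
        if whitelist and source not in whitelist:
            counts['skip-ingest_request_source'] += 1
            return False
        if source.startswith('arabesque'):
            # only trust arabesque-derived results for these link sources
            if row['request'].get('link_source') not in ('arxiv', 'pmc'):
                counts['skip-arabesque-source'] += 1
                return False
        if source.startswith('savepapernow'):
            # never process async savepapernow requests
            counts['skip-savepapernow'] += 1
            return False
        return True

    counts = Counter()
    assert want_ingest_row({'request': {'ingest_request_source': 'arabesque',
                                        'link_source': 'pmc'}}, counts)
    assert not want_ingest_row({'request': {'ingest_request_source': 'arabesque',
                                            'link_source': 'doaj'}}, counts)
    assert counts['skip-arabesque-source'] == 1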
""" - def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs): + def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of PubMed/MEDLINE XML metadata") @@ -330,38 +331,9 @@ class PubmedImporter(EntityImporter): **kwargs) self.lookup_refs = lookup_refs - extid_map_file = kwargs.get('extid_map_file') - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.create_containers = kwargs.get('create_containers', True) self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, pmid): - if self.extid_map_db is None: - return dict(doi=None, core_id=None, pmid=None, pmcid=None, - wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1", - [pmid]).fetchone() - if row is None: - return dict(doi=None, core_id=None, pmid=None, pmcid=None, - wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] - return dict( - core_id=row[0], - doi=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, obj): return True @@ -376,20 +348,14 @@ class PubmedImporter(EntityImporter): identifiers = pubmed.ArticleIdList pmid = medline.PMID.string.strip() doi = identifiers.find("ArticleId", IdType="doi") - if doi: - doi = doi.string.lower().strip() - if doi.startswith('doi:'): - doi = doi[4:] - if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]) and len(doi.split()) == 1: - sys.stderr.write("BOGUS DOI: {}\n".format(doi)) - doi = None + if doi and doi.string: + doi = clean_doi(doi.string) + else: + doi = None pmcid = identifiers.find("ArticleId", IdType="pmc") if pmcid: - pmcid = pmcid.string.strip().upper() - # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline - if not pmcid.startswith("PMC"): - pmcid = None + pmcid = clean_pmcid(pmcid.string.strip().upper()) release_type = None pub_types = [] @@ -486,6 +452,8 @@ class PubmedImporter(EntityImporter): pub_date = medline.Article.find('ArticleDate') if not pub_date: pub_date = journal.PubDate + if not pub_date: + pub_date = journal.JournalIssue.PubDate release_date = None release_year = None if pub_date.Year: @@ -498,8 +466,17 @@ class PubmedImporter(EntityImporter): int(pub_date.Day.string)) release_date = release_date.isoformat() except ValueError as ve: - sys.stderr.write("bad date, skipping: {}\n".format(ve)) + print("bad date, skipping: {}".format(ve), file=sys.stderr) release_date = None + elif pub_date.MedlineDate: + medline_date = pub_date.MedlineDate.string.strip() + if len(medline_date) >= 4 and medline_date[:4].isdigit(): + release_year = int(medline_date[:4]) + if release_year < 1300 or release_year > 2040: + print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) + release_year = None + else: + print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) if journal.find("Title"): container_name = journal.Title.string @@ -641,19 +618,21 @@ class PubmedImporter(EntityImporter): if pubmed.ReferenceList: for ref in pubmed.ReferenceList.find_all('Reference'): ref_extra = dict() - ref_pmid = ref.find("ArticleId", IdType="pubmed") ref_doi = ref.find("ArticleId", IdType="doi") - ref_release_id = None + if ref_doi: 
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 80bcfa5a..a77c5eb0 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -19,7 +19,10 @@ def clean_doi(raw):
 
     Returns None if not a valid DOI
     """
+    if not raw:
+        return None
     raw = raw.strip()
+    raw = raw.replace('\u2013', '-') # en-dash
     if len(raw.split()) != 1:
         return None
     if raw.startswith("doi:"):
@@ -32,6 +35,8 @@ def clean_doi(raw):
         raw = raw[8:]
     if raw.startswith("dx.doi.org/"):
         raw = raw[11:]
+    if raw[7:9] == "//":
+        raw = raw[:8] + raw[9:]
     if not raw.startswith("10."):
         return None
     if not DOI_REGEX.fullmatch(raw):
@@ -40,6 +45,10 @@ def clean_doi(raw):
 
 def test_clean_doi():
     assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
+    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
+    assert clean_doi("10.23750/abm.v88i2 -s.6506") == None
+    assert clean_doi("10.17167/mksz.2017.2.129–155") == "10.17167/mksz.2017.2.129-155"
     assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
@@ -54,6 +63,8 @@ def clean_arxiv_id(raw):
 
     Works with versioned or un-versioned arxiv identifiers.
     """
+    if not raw:
+        return None
     raw = raw.strip()
     if raw.lower().startswith("arxiv:"):
         raw = raw[6:]
@@ -90,7 +101,26 @@ def test_clean_arxiv_id():
     assert clean_arxiv_id("0806.v1") == None
     assert clean_arxiv_id("08062878v1") == None
 
+def clean_pmid(raw):
+    if not raw:
+        return None
+    raw = raw.strip()
+    if len(raw.split()) != 1:
+        return None
+    if raw.isdigit():
+        return raw
+    return None
+
+def test_clean_pmid():
+    assert clean_pmid("1234") == "1234"
+    assert clean_pmid("1234 ") == "1234"
+    assert clean_pmid("PMC123") == None
+    assert clean_pmid("qfba3") == None
+    assert clean_pmid("") == None
+
 def clean_pmcid(raw):
+    if not raw:
+        return None
     raw = raw.strip()
     if len(raw.split()) != 1:
         return None
@@ -99,6 +129,8 @@ def clean_pmcid(raw):
     return None
 
 def clean_sha1(raw):
+    if not raw:
+        return None
     raw = raw.strip().lower()
     if len(raw.split()) != 1:
         return None
@@ -134,6 +166,8 @@ def test_clean_sha256():
 ISSN_REGEX = re.compile("^\d{4}-\d{3}[0-9X]$")
 
 def clean_issn(raw):
+    if not raw:
+        return None
     raw = raw.strip().upper()
     if len(raw) != 9:
         return None
@@ -150,6 +184,8 @@ def test_clean_issn():
 ISBN13_REGEX = re.compile("^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$")
 
 def clean_isbn13(raw):
+    if not raw:
+        return None
     raw = raw.strip()
     if not ISBN13_REGEX.fullmatch(raw):
         return None
@@ -164,6 +200,8 @@ def test_clean_isbn13():
 ORCID_REGEX = re.compile("^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
 
 def clean_orcid(raw):
+    if not raw:
+        return None
     raw = raw.strip()
     if not ORCID_REGEX.fullmatch(raw):
         return None
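Editor's note on the net effect of the normal.py changes: every cleaner is now safe to call with None or empty input, and clean_doi additionally collapses a doubled slash after the DOI prefix and normalizes en-dashes. Expected behavior, restated from the diff's own test cases (assumes fatcat_tools.normal is importable):

    from fatcat_tools.normal import clean_doi, clean_pmid

    assert clean_doi(None) is None                  # previously raised AttributeError
    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
    assert clean_doi("10.17167/mksz.2017.2.129\u2013155") == "10.17167/mksz.2017.2.129-155"
    assert clean_pmid("1234 ") == "1234"
    assert clean_pmid("PMC123") is None             # PMC ids are not PMIDs
    assert clean_pmid("qfba3") is None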
""" + if not raw: + return None raw = raw.strip() if raw.lower().startswith("arxiv:"): raw = raw[6:] @@ -90,7 +101,26 @@ def test_clean_arxiv_id(): assert clean_arxiv_id("0806.v1") == None assert clean_arxiv_id("08062878v1") == None +def clean_pmid(raw): + if not raw: + return None + raw = raw.strip() + if len(raw.split()) != 1: + return None + if raw.isdigit(): + return raw + return None + +def test_clean_pmid(): + assert clean_pmid("1234") == "1234" + assert clean_pmid("1234 ") == "1234" + assert clean_pmid("PMC123") == None + assert clean_sha1("qfba3") == None + assert clean_sha1("") == None + def clean_pmcid(raw): + if not raw: + return None raw = raw.strip() if len(raw.split()) != 1: return None @@ -99,6 +129,8 @@ def clean_pmcid(raw): return None def clean_sha1(raw): + if not raw: + return None raw = raw.strip().lower() if len(raw.split()) != 1: return None @@ -134,6 +166,8 @@ def test_clean_sha256(): ISSN_REGEX = re.compile("^\d{4}-\d{3}[0-9X]$") def clean_issn(raw): + if not raw: + return None raw = raw.strip().upper() if len(raw) != 9: return None @@ -150,6 +184,8 @@ def test_clean_issn(): ISBN13_REGEX = re.compile("^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") def clean_isbn13(raw): + if not raw: + return None raw = raw.strip() if not ISBN13_REGEX.fullmatch(raw): return None @@ -164,6 +200,8 @@ def test_clean_isbn13(): ORCID_REGEX = re.compile("^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") def clean_orcid(raw): + if not raw: + return None raw = raw.strip() if not ORCID_REGEX.fullmatch(raw): return None diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index e08d56b8..d6393753 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -35,12 +35,12 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat url = "https://doi.org/{}".format(release.ext_ids.doi) link_source = "doi" link_source_id = release.ext_ids.doi - elif release.ext_ids.pmcid and release.ext_ids.pmid: + elif release.ext_ids.pmcid: # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) - link_source = "pubmed" - link_source_id = release.ext_ids.pmid + link_source = "pmc" + link_source_id = release.ext_ids.pmcid if not url: return None @@ -48,7 +48,7 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat ext_ids = release.ext_ids.to_dict() ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) - if oa_only and link_source not in ('arxiv', 'pubmed'): + if oa_only and link_source not in ('arxiv', 'pmc'): es = release_to_elasticsearch(release) if not es['is_oa']: return None |