Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/chocula.py | 10
-rw-r--r--  python/fatcat_tools/importers/ingest.py  | 19
-rw-r--r--  python/fatcat_tools/importers/orcid.py   |  4
-rw-r--r--  python/fatcat_tools/importers/pubmed.py  | 92
4 files changed, 62 insertions, 63 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 6915ba98..eea50314 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -128,15 +128,15 @@ class ChoculaImporter(EntityImporter):
             existing.publisher = ce.publisher
             existing.container_type = existing.container_type or ce.container_type
             for k in ('urls', 'webarchive_urls'):
-                # update, or clobber/remove any existing values. often
-                # want/need to remove dead URLs
+                # update, which might clobber, but won't remove
                 if ce.extra.get(k):
                     existing.extra[k] = ce.extra.get(k, [])
-                elif k in existing.extra.keys():
-                    existing.extra.pop(k)
+                # note: in some cases we might *want* to clobber existing (if
+                # all URLs found to be bad), but being conservative for now so
+                # we don't clobber human edits
             for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
                       'szczepanski', 'doaj'):
-                # update, but don't remove any existing value
+                # update/overwrite, but don't remove any existing value
                 if ce.extra.get(k):
                     existing.extra[k] = ce.extra[k]
             if ce.extra.get('languages'):
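The chocula.py change above replaces delete-on-missing with an overwrite-only merge. A minimal sketch of that policy, using plain stand-in dicts rather than the importer's real container entities:

# stand-in "existing" entity extra and incoming chocula extra
existing_extra = {'urls': ['http://old.example.com/'], 'country': 'us'}
incoming_extra = {'urls': [], 'issne': '1234-5678'}

for k in ('urls', 'webarchive_urls'):
    # update, which might clobber, but won't remove
    if incoming_extra.get(k):
        existing_extra[k] = incoming_extra[k]

for k in ('issne', 'issnp', 'country'):
    # update/overwrite, but don't remove any existing value
    if incoming_extra.get(k):
        existing_extra[k] = incoming_extra[k]

# old URLs survive (possibly stale), so human-edited values are never silently dropped
assert existing_extra['urls'] == ['http://old.example.com/']
assert existing_extra['issne'] == '1234-5678'
assert existing_extra['country'] == 'us'
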
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index c47f0aa7..33c40eff 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -29,6 +29,7 @@ class IngestFileResultImporter(EntityImporter):
         self.ingest_request_source_whitelist = [
             'fatcat-changelog',
             'fatcat-ingest-container',
+            'arabesque',
         ]
         if kwargs.get('skip_source_whitelist', False):
             self.ingest_request_source_whitelist = []
@@ -55,6 +56,10 @@ class IngestFileResultImporter(EntityImporter):
         if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
             self.counts['skip-ingest_request_source'] += 1
             return False
+        if source.startswith('arabesque'):
+            if row['request'].get('link_source') not in ('arxiv', 'pmc'):
+                self.counts['skip-arabesque-source'] += 1
+                return False
         if source.startswith('savepapernow'):
             # never process async savepapernow requests
             self.counts['skip-savepapernow'] += 1
@@ -152,20 +157,22 @@ class IngestFileResultImporter(EntityImporter):
             if err.status != 404:
                 raise err
 
+        # check for existing edits-in-progress with same file hash
+        for other in self._entity_queue:
+            if other.sha1 == fe.sha1:
+                self.counts['skip-in-queue'] += 1
+                return False
+
         if not existing:
             return True
 
+        # the following checks all assume there is an existing item
+
         if (fe.release_ids[0] in existing.release_ids) and existing.urls:
             # TODO: could still, in theory update with the new URL?
             self.counts['exists'] += 1
             return False
 
-        # check for existing edits-in-progress with same file hash
-        for other in self._entity_queue:
-            if other.sha1 == fe.sha1:
-                self.counts['skip-in-queue'] += 1
-                return False
-
         if not self.do_updates:
             self.counts['skip-update-disabled'] += 1
             return False
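The ingest.py reordering matters because the same-sha1 queue scan now runs before the `if not existing` early return: two results for the same file in one batch can no longer both be created when neither is in the catalog yet. A self-contained sketch of that ordering, with a placeholder FileEntity class standing in for the real fatcat entity:

class FileEntity:
    # placeholder for the fatcat_openapi_client file entity; only sha1 matters here
    def __init__(self, sha1):
        self.sha1 = sha1

_entity_queue = [FileEntity(sha1='deadbeef' * 5)]

def try_update(fe, existing=None):
    # check for in-progress edits with the same file hash *before* the
    # "brand new file" early return (this is the reordering above)
    for other in _entity_queue:
        if other.sha1 == fe.sha1:
            return False  # counts['skip-in-queue']
    if not existing:
        return True  # queue for creation
    return False  # existing-item checks would follow here

assert try_update(FileEntity(sha1='deadbeef' * 5)) is False  # deduped in-batch
assert try_update(FileEntity(sha1='cafef00d' * 5)) is True   # genuinely new
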
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 0a2c8610..554e052f 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -41,6 +41,10 @@ class OrcidImporter(EntityImporter):
         obj is a python dict (parsed from json).
         returns a CreatorEntity
         """
+
+        if 'person' not in obj:
+            return False
+
         name = obj['person']['name']
         if not name:
             return None
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 80cf986c..3611a299 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
 from bs4.element import NavigableString
 
 import fatcat_openapi_client
+from fatcat_tools.normal import *
 from .common import EntityImporter, clean, LANG_MAP_MARC
 
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
@@ -317,7 +318,7 @@ class PubmedImporter(EntityImporter):
     TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
     """
 
-    def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs):
+    def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
 
         eg_desc = kwargs.get('editgroup_description',
             "Automated import of PubMed/MEDLINE XML metadata")
@@ -330,38 +331,9 @@ class PubmedImporter(EntityImporter):
             **kwargs)
 
         self.lookup_refs = lookup_refs
-        extid_map_file = kwargs.get('extid_map_file')
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.create_containers = kwargs.get('create_containers', True)
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, pmid):
-        if self.extid_map_db is None:
-            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
-                wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
-            [pmid]).fetchone()
-        if row is None:
-            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
-                wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = [str(cell or '') or None for cell in row]
-        return dict(
-            core_id=row[0],
-            doi=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def want(self, obj):
         return True
 
@@ -376,20 +348,14 @@ class PubmedImporter(EntityImporter):
         identifiers = pubmed.ArticleIdList
         pmid = medline.PMID.string.strip()
         doi = identifiers.find("ArticleId", IdType="doi")
-        if doi:
-            doi = doi.string.lower().strip()
-            if doi.startswith('doi:'):
-                doi = doi[4:]
-            if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]) and len(doi.split()) == 1:
-                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
-                doi = None
+        if doi and doi.string:
+            doi = clean_doi(doi.string)
+        else:
+            doi = None
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string.strip().upper()
-            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
-            if not pmcid.startswith("PMC"):
-                pmcid = None
+            pmcid = clean_pmcid(pmcid.string.strip().upper())
 
         release_type = None
         pub_types = []
@@ -486,6 +452,8 @@ class PubmedImporter(EntityImporter):
         pub_date = medline.Article.find('ArticleDate')
         if not pub_date:
             pub_date = journal.PubDate
+        if not pub_date:
+            pub_date = journal.JournalIssue.PubDate
         release_date = None
         release_year = None
         if pub_date.Year:
@@ -498,8 +466,17 @@ class PubmedImporter(EntityImporter):
                         int(pub_date.Day.string))
                     release_date = release_date.isoformat()
                 except ValueError as ve:
-                    sys.stderr.write("bad date, skipping: {}\n".format(ve))
+                    print("bad date, skipping: {}".format(ve), file=sys.stderr)
                     release_date = None
+        elif pub_date.MedlineDate:
+            medline_date = pub_date.MedlineDate.string.strip()
+            if len(medline_date) >= 4 and medline_date[:4].isdigit():
+                release_year = int(medline_date[:4])
+                if release_year < 1300 or release_year > 2040:
+                    print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+                    release_year = None
+            else:
+                print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
 
         if journal.find("Title"):
             container_name = journal.Title.string
@@ -641,19 +618,21 @@ class PubmedImporter(EntityImporter):
         if pubmed.ReferenceList:
             for ref in pubmed.ReferenceList.find_all('Reference'):
                 ref_extra = dict()
-                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 ref_doi = ref.find("ArticleId", IdType="doi")
-                ref_release_id = None
+                if ref_doi:
+                    ref_doi = clean_doi(ref_doi.string)
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 if ref_pmid:
-                    ref_pmid = ref_pmid.string.strip()
-                    ref_extra['pmid'] = ref_pmid
-                    if self.lookup_refs:
-                        ref_release_id = self.lookup_pmid(ref_pmid)
+                    ref_pmid = clean_pmid(ref_pmid.string)
+                ref_release_id = None
                 if ref_doi:
-                    ref_doi = ref_doi.string.lower().strip()
                     ref_extra['doi'] = ref_doi
                     if self.lookup_refs:
                         ref_release_id = self.lookup_doi(ref_doi)
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid
+                    if self.lookup_refs:
+                        ref_release_id = self.lookup_pmid(ref_pmid)
                 ref_raw = ref.Citation
                 if ref_raw:
                     ref_extra['unstructured'] = ref_raw.string
@@ -668,7 +647,6 @@ class PubmedImporter(EntityImporter):
 
         # extra:
         #   translation_of
-        #   subtitle
         #   aliases
         #   container_name
         #   group-title
@@ -729,8 +707,9 @@ class PubmedImporter(EntityImporter):
                 if err.status != 404:
                     raise err
             if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
-                warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
-                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+                warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
+                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+                warnings.warn(warn_str)
                 self.counts['warn-pmid-doi-mismatch'] += 1
                 # don't clobber DOI, but do group together
                 re.ext_ids.doi = None
@@ -748,6 +727,15 @@ class PubmedImporter(EntityImporter):
             existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
             existing.refs = existing.refs or re.refs
             existing.extra['pubmed'] = re.extra['pubmed']
+            # update subtitle in-place first
+            if not existing.subtitle and existing.extra.get('subtitle'):
+                subtitle = existing.extra.pop('subtitle')
+                if type(subtitle) == list:
+                    subtitle = subtitle[0]
+                if subtitle:
+                    existing.subtitle = subtitle
+            if not existing.subtitle:
+                existing.subtitle = re.subtitle
             try:
                 self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
                 self.counts['update'] += 1
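pubmed.py swaps its inline DOI/PMID/PMCID cleanup for shared helpers wildcard-imported from fatcat_tools.normal. As a rough stand-in, inferred only from the inline validation this commit deletes (the real clean_doi() may normalize more cases), the helper is expected to behave roughly like:

def clean_doi_sketch(raw):
    # lowercase, strip an optional "doi:" prefix, and require the
    # "10.<registrant>/<suffix>" shape with no internal whitespace
    if not raw:
        return None
    doi = raw.strip().lower()
    if doi.startswith('doi:'):
        doi = doi[4:]
    if not doi.startswith('10.') or '/' not in doi or not doi.split('/', 1)[1]:
        return None
    if len(doi.split()) != 1:
        return None
    return doi

assert clean_doi_sketch('doi:10.1234/ABC.5') == '10.1234/abc.5'
assert clean_doi_sketch('10.1234') is None    # no suffix after the slash
assert clean_doi_sketch('10.1/a b') is None   # internal whitespace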

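The new MedlineDate branch covers PubMed's free-form date strings (ranges and seasons such as "1998 Dec-1999 Jan" or "2000 Spring") that leave the structured Year/Month/Day fields empty. A standalone version of that fallback logic, with the skip messages omitted:

def medline_year_sketch(medline_date):
    # take the leading four digits as the year; reject implausible values,
    # mirroring the 1300..2040 sanity range in the hunk above
    medline_date = medline_date.strip()
    if len(medline_date) >= 4 and medline_date[:4].isdigit():
        year = int(medline_date[:4])
        if 1300 <= year <= 2040:
            return year
    return None

assert medline_year_sketch('1998 Dec-1999 Jan') == 1998
assert medline_year_sketch('2000 Spring') == 2000
assert medline_year_sketch('Winter 2001') is None  # hypothetical input: no leading year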