diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-23 18:13:43 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-23 18:18:34 -0800 | 
| commit | 05ce151586ec69edc8450ea7d901045798de19f6 (patch) | |
| tree | 336db438967bc1133569584f8d31bb1e42d12f5b | |
| parent | cf62de67f788f19efb629413376eb2502d85d041 (diff) | |
| download | fatcat-05ce151586ec69edc8450ea7d901045798de19f6.tar.gz fatcat-05ce151586ec69edc8450ea7d901045798de19f6.zip | |
pubmed: use standard identifier cleaners
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 31 | 
1 file changed, 14 insertions, 17 deletions
```diff
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 59b65b19..ced78d5d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
 from bs4.element import NavigableString
 
 import fatcat_openapi_client
+from fatcat_tools.normal import *
 from .common import EntityImporter, clean, LANG_MAP_MARC
 
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
@@ -348,19 +349,13 @@ class PubmedImporter(EntityImporter):
         pmid = medline.PMID.string.strip()
         doi = identifiers.find("ArticleId", IdType="doi")
         if doi and doi.string:
-            doi = doi.string.lower().strip()
-            if doi.startswith('doi:'):
-                doi = doi[4:]
-            if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]) and len(doi.split()) == 1:
-                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
-                doi = None
+            doi = clean_doi(doi.string)
+        else:
+            doi = None
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string.strip().upper()
-            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
-            if not pmcid.startswith("PMC"):
-                pmcid = None
+            pmcid = clean_pmcid(pmcid.string.strip().upper())
 
         release_type = None
         pub_types = []
@@ -623,19 +618,21 @@ class PubmedImporter(EntityImporter):
         if pubmed.ReferenceList:
             for ref in pubmed.ReferenceList.find_all('Reference'):
                 ref_extra = dict()
-                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 ref_doi = ref.find("ArticleId", IdType="doi")
-                ref_release_id = None
+                if ref_doi:
+                    ref_doi = clean_doi(ref_doi.string)
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 if ref_pmid:
-                    ref_pmid = ref_pmid.string.strip()
-                    ref_extra['pmid'] = ref_pmid
-                    if self.lookup_refs:
-                        ref_release_id = self.lookup_pmid(ref_pmid)
+                    ref_pmid = clean_pmid(ref_pmid.string)
+                ref_release_id = None
                 if ref_doi:
-                    ref_doi = ref_doi.string.lower().strip()
                     ref_extra['doi'] = ref_doi
                     if self.lookup_refs:
                         ref_release_id = self.lookup_doi(ref_doi)
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid
+                    if self.lookup_refs:
+                        ref_release_id = self.lookup_pmid(ref_pmid)
                 ref_raw = ref.Citation
                 if ref_raw:
                     ref_extra['unstructured'] = ref_raw.string
```
