diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 19 | 
1 file changed, 13 insertions, 6 deletions
| diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index e8405dd4..4bfbbc79 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):          pmcid = identifiers.find("ArticleId", IdType="pmc")          if pmcid: -            pmcid = pmcid.string +            pmcid = pmcid.string.strip().upper() +            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline +            if not pmcid.startswith("PMC"): +                pmcid = None          release_type = None          pub_types = [] @@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):          if pub_date.Year:              release_year = int(pub_date.Year.string)              if pub_date.find("Day") and pub_date.find("Month"): -                release_date = datetime.date( -                    release_year, -                    MONTH_ABBR_MAP[pub_date.Month.string], -                    int(pub_date.Day.string)) -                release_date = release_date.isoformat() +                try: +                    release_date = datetime.date( +                        release_year, +                        MONTH_ABBR_MAP[pub_date.Month.string], +                        int(pub_date.Day.string)) +                    release_date = release_date.isoformat() +                except ValueError as ve: +                    sys.stderr.write("bad date, skipping: {}\n".format(ve)) +                    release_date = None          if journal.find("Title"):              container_name = journal.Title.string | 
