From f0be35e8031032cda3a98a3b4a79d4e4d6866817 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 3 Jun 2019 20:22:25 -0700
Subject: more pubmed importer fixes

---
 python/fatcat_tools/importers/pubmed.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index e8405dd4..4bfbbc79 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string
+            pmcid = pmcid.string.strip().upper()
+            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
+            if not pmcid.startswith("PMC"):
+                pmcid = None
 
         release_type = None
         pub_types = []
@@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):
         if pub_date.Year:
             release_year = int(pub_date.Year.string)
             if pub_date.find("Day") and pub_date.find("Month"):
-                release_date = datetime.date(
-                    release_year,
-                    MONTH_ABBR_MAP[pub_date.Month.string],
-                    int(pub_date.Day.string))
-                release_date = release_date.isoformat()
+                try:
+                    release_date = datetime.date(
+                        release_year,
+                        MONTH_ABBR_MAP[pub_date.Month.string],
+                        int(pub_date.Day.string))
+                    release_date = release_date.isoformat()
+                except ValueError as ve:
+                    sys.stderr.write("bad date, skipping: {}\n".format(ve))
+                    release_date = None
 
         if journal.find("Title"):
             container_name = journal.Title.string
-- 
cgit v1.2.3