summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-06-03 20:22:25 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-06-03 20:22:25 -0700
commit: f0be35e8031032cda3a98a3b4a79d4e4d6866817 (patch)
tree: 64df6c48d3655a77c5ec7ee88507c41bb5ca16ab
parent: dee48a8f1ad3599cefa044c476966929cd869cfa (diff)
download: fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.tar.gz
fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.zip
more pubmed importer fixes
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 19
1 files changed, 13 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index e8405dd4..4bfbbc79 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):
pmcid = identifiers.find("ArticleId", IdType="pmc")
if pmcid:
- pmcid = pmcid.string
+ pmcid = pmcid.string.strip().upper()
+ # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
+ if not pmcid.startswith("PMC"):
+ pmcid = None
release_type = None
pub_types = []
@@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):
if pub_date.Year:
release_year = int(pub_date.Year.string)
if pub_date.find("Day") and pub_date.find("Month"):
- release_date = datetime.date(
- release_year,
- MONTH_ABBR_MAP[pub_date.Month.string],
- int(pub_date.Day.string))
- release_date = release_date.isoformat()
+ try:
+ release_date = datetime.date(
+ release_year,
+ MONTH_ABBR_MAP[pub_date.Month.string],
+ int(pub_date.Day.string))
+ release_date = release_date.isoformat()
+ except ValueError as ve:
+ sys.stderr.write("bad date, skipping: {}\n".format(ve))
+ release_date = None
if journal.find("Title"):
container_name = journal.Title.string