summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 19
1 file changed, 13 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index e8405dd4..4bfbbc79 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):
pmcid = identifiers.find("ArticleId", IdType="pmc")
if pmcid:
- pmcid = pmcid.string
+ pmcid = pmcid.string.strip().upper()
+ # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
+ if not pmcid.startswith("PMC"):
+ pmcid = None
release_type = None
pub_types = []
@@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):
if pub_date.Year:
release_year = int(pub_date.Year.string)
if pub_date.find("Day") and pub_date.find("Month"):
- release_date = datetime.date(
- release_year,
- MONTH_ABBR_MAP[pub_date.Month.string],
- int(pub_date.Day.string))
- release_date = release_date.isoformat()
+ try:
+ release_date = datetime.date(
+ release_year,
+ MONTH_ABBR_MAP[pub_date.Month.string],
+ int(pub_date.Day.string))
+ release_date = release_date.isoformat()
+ except ValueError as ve:
+ sys.stderr.write("bad date, skipping: {}\n".format(ve))
+ release_date = None
if journal.find("Title"):
container_name = journal.Title.string