diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-03 20:22:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-03 20:22:25 -0700 |
commit | f0be35e8031032cda3a98a3b4a79d4e4d6866817 (patch) | |
tree | 64df6c48d3655a77c5ec7ee88507c41bb5ca16ab | |
parent | dee48a8f1ad3599cefa044c476966929cd869cfa (diff) | |
download | fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.tar.gz fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.zip |
more pubmed importer fixes
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 19 |
1 files changed, 13 insertions, 6 deletions
```diff
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index e8405dd4..4bfbbc79 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string
+            pmcid = pmcid.string.strip().upper()
+            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
+            if not pmcid.startswith("PMC"):
+                pmcid = None
 
         release_type = None
         pub_types = []
@@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):
         if pub_date.Year:
             release_year = int(pub_date.Year.string)
             if pub_date.find("Day") and pub_date.find("Month"):
-                release_date = datetime.date(
-                    release_year,
-                    MONTH_ABBR_MAP[pub_date.Month.string],
-                    int(pub_date.Day.string))
-                release_date = release_date.isoformat()
+                try:
+                    release_date = datetime.date(
+                        release_year,
+                        MONTH_ABBR_MAP[pub_date.Month.string],
+                        int(pub_date.Day.string))
+                    release_date = release_date.isoformat()
+                except ValueError as ve:
+                    sys.stderr.write("bad date, skipping: {}\n".format(ve))
+                    release_date = None
 
         if journal.find("Title"):
             container_name = journal.Title.string
```