From 1bb0a2181d5a30241d80279c5930eb753733f30b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Dec 2019 14:01:15 -0800 Subject: add basic MedlineDate year parsing --- python/fatcat_tools/importers/pubmed.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 80cf986c..aeac43b5 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -486,6 +486,8 @@ class PubmedImporter(EntityImporter): pub_date = medline.Article.find('ArticleDate') if not pub_date: pub_date = journal.PubDate + if not pub_date: + pub_date = journal.JournalIssue.PubDate release_date = None release_year = None if pub_date.Year: @@ -500,6 +502,15 @@ class PubmedImporter(EntityImporter): except ValueError as ve: sys.stderr.write("bad date, skipping: {}\n".format(ve)) release_date = None + elif pub_date.MedlineDate: + medline_date = pub_date.MedlineDate.string.strip() + if len(medline_date) >= 4 and medline_date[:4].isdigit(): + release_year = int(medline_date[:4]) + if release_year < 1300 or release_year > 2040: + print("bad medline year, skipping: {}\n".format(release_year), file=sys.stderr) + release_year = None + else: + print("unparsable medline date, skipping: {}\n".format(medline_date), file=sys.stderr) if journal.find("Title"): container_name = journal.Title.string -- cgit v1.2.3