diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-03 20:22:25 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-03 20:22:25 -0700 | 
| commit | f0be35e8031032cda3a98a3b4a79d4e4d6866817 (patch) | |
| tree | 64df6c48d3655a77c5ec7ee88507c41bb5ca16ab /python/fatcat_tools | |
| parent | dee48a8f1ad3599cefa044c476966929cd869cfa (diff) | |
| download | fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.tar.gz fatcat-f0be35e8031032cda3a98a3b4a79d4e4d6866817.zip | |
more pubmed importer fixes
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 19 | 
1 files changed, 13 insertions, 6 deletions
```diff
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index e8405dd4..4bfbbc79 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -386,7 +386,10 @@ class PubmedImporter(EntityImporter):
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string
+            pmcid = pmcid.string.strip().upper()
+            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
+            if not pmcid.startswith("PMC"):
+                pmcid = None
 
         release_type = None
         pub_types = []
@@ -488,11 +491,15 @@ class PubmedImporter(EntityImporter):
         if pub_date.Year:
             release_year = int(pub_date.Year.string)
             if pub_date.find("Day") and pub_date.find("Month"):
-                release_date = datetime.date(
-                    release_year,
-                    MONTH_ABBR_MAP[pub_date.Month.string],
-                    int(pub_date.Day.string))
-                release_date = release_date.isoformat()
+                try:
+                    release_date = datetime.date(
+                        release_year,
+                        MONTH_ABBR_MAP[pub_date.Month.string],
+                        int(pub_date.Day.string))
+                    release_date = release_date.isoformat()
+                except ValueError as ve:
+                    sys.stderr.write("bad date, skipping: {}\n".format(ve))
+                    release_date = None
 
         if journal.find("Title"):
             container_name = journal.Title.string
```
