diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-06 17:30:20 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-06 17:30:20 -0800 |
commit | a53e7dc054a77c9fef4fa5d0e0638777d9faff71 (patch) | |
tree | 39fd760b301706f744d73810266f0754b90204c0 | |
parent | 9a944bfb6d994fe2f6865c5b9117920ed99cc5f1 (diff) | |
download | fatcat-a53e7dc054a77c9fef4fa5d0e0638777d9faff71.tar.gz fatcat-a53e7dc054a77c9fef4fa5d0e0638777d9faff71.zip |
small fixes to arxivraw parser
-rw-r--r-- | python/parse_arxivraw_xml.py | 5 |
1 file changed, 3 insertions, 2 deletions
```diff
diff --git a/python/parse_arxivraw_xml.py b/python/parse_arxivraw_xml.py
index e2fab510..16def821 100644
--- a/python/parse_arxivraw_xml.py
+++ b/python/parse_arxivraw_xml.py
@@ -118,6 +118,7 @@ class ArxivRawXmlParser():
             license_slug = metadata.license.string.strip()
         abstracts = None
         if metadata.abstract:
+            # TODO: test for this multi-abstract code path
             abstracts = []
             abst = metadata.abstract.string.strip()
             orig = None
@@ -144,14 +145,14 @@ class ArxivRawXmlParser():
         for version in metadata.find_all('version'):
             arxiv_id = base_id + version['version']
             release_date = version.date.string.strip()
-            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z")
+            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
             versions.append(dict(
                 work_id=None,
                 title=title,
                 #original_title
                 release_type="article-journal",
                 release_status='submitted', # XXX: source_type?
-                release_date=release_date.isoformat() + "Z",
+                release_date=release_date.isoformat(),
                 release_year=release_date.year,
                 arxiv_id=arxiv_id,
                 #doi (see below)
```