diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 13:52:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 13:52:19 -0700 |
commit | 5e48498083cf5b7bccec03ba6352c6acbe6cdb00 (patch) | |
tree | 3a63e5b92c329638f1d31ca69658a443844311d0 | |
parent | 3e21211d65f86990193f63dcfcc71db81720b454 (diff) | |
download | fatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.tar.gz fatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.zip |
improve arxiv author name parsing
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 1c65890e..e6fb8039 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,4 +1,5 @@ +import re import sys import json import datetime @@ -21,12 +22,17 @@ def latex_to_text(raw): def parse_arxiv_authors(raw): if not raw: return [] + raw = raw.replace('*', '') + if '(' in raw: + raw = re.sub(r'\(.*\)', '', raw) authors = raw.split(', ') if authors: last = authors[-1].split(" and ") if len(last) == 2: authors[-1] = last[0] authors.append(last[1]) + if authors[-1].startswith("and "): + authors[-1] = authors[-1][4:] authors = [latex_to_text(a).strip() for a in authors] return authors @@ -42,10 +48,28 @@ def test_parse_arxiv_authors(): "Izaak Neri", "Édgar Roldán", ] + assert parse_arxiv_authors("Izaak Neri, and \\'Edgar Rold\\'an") == [ + "Izaak Neri", + "Édgar Roldán", + ] + assert parse_arxiv_authors("Izaak Neri, et al.") == [ + "Izaak Neri", + "et al.", + ] assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [ "Raphael Chetrite Shamik Gupta", ] + assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [ + "B. P. Lanyon", + "T. J. Weinhold", + "N. K. Langford", + "M. Barbieri", + "D. F. V. James", + "A. Gilchrist", + "A. G. White", + ] + class ArxivRawImporter(EntityImporter): """ |