summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/arxiv.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-22 13:52:19 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-22 13:52:19 -0700
commit5e48498083cf5b7bccec03ba6352c6acbe6cdb00 (patch)
tree3a63e5b92c329638f1d31ca69658a443844311d0 /python/fatcat_tools/importers/arxiv.py
parent3e21211d65f86990193f63dcfcc71db81720b454 (diff)
downloadfatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.tar.gz
fatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.zip
improve arxiv author name parsing
Diffstat (limited to 'python/fatcat_tools/importers/arxiv.py')
-rw-r--r--python/fatcat_tools/importers/arxiv.py24
1 files changed, 24 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1c65890e..e6fb8039 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -1,4 +1,5 @@
+import re
import sys
import json
import datetime
@@ -21,12 +22,17 @@ def latex_to_text(raw):
def _strip_parentheticals(text):
    """Return *text* with every balanced ``( ... )`` group removed.

    Groups are removed one by one (nested groups included), so text lying
    between two separate groups is preserved. A ``)`` with no matching
    ``(`` is kept as-is. If a ``(`` is never closed, fall back to the
    previous greedy-regex behaviour rather than discarding the tail of
    the string.
    """
    kept = []
    depth = 0
    for char in text:
        if char == '(':
            depth += 1
        elif char == ')' and depth:
            depth -= 1
        elif not depth:
            kept.append(char)
    if depth:
        # Unbalanced input: keep the legacy first-"(" to last-")" removal.
        return re.sub(r'\(.*\)', '', text)
    return ''.join(kept)


def parse_arxiv_authors(raw):
    """Split a raw arXiv author string into a list of author names.

    Processing steps, in order:

    - ``*`` footnote markers are removed;
    - parenthesised spans (e.g. affiliations) are removed group by group,
      so ``"A (1), B (2)"`` keeps both authors;
    - the string is split on ``", "``;
    - a final ``"X and Y"`` element is split into two names, and a leading
      ``"and "`` left over from an Oxford comma is dropped;
    - each name is passed through ``latex_to_text`` (defined elsewhere in
      this module; presumably converts LaTeX escapes to unicode) and
      stripped of surrounding whitespace;
    - names that end up empty are discarded.

    Returns an empty list when *raw* is empty or ``None``.
    """
    if not raw:
        return []
    raw = raw.replace('*', '')
    if '(' in raw:
        raw = _strip_parentheticals(raw)
    # str.split with an explicit separator always yields at least one item,
    # so authors[-1] is safe without an emptiness check.
    authors = raw.split(', ')
    last = authors[-1].split(" and ")
    if len(last) == 2:
        authors[-1] = last[0]
        authors.append(last[1])
    # "A, B, and C" leaves "and C" as the final element after the split.
    if authors[-1].startswith("and "):
        authors[-1] = authors[-1][4:]
    authors = [latex_to_text(a).strip() for a in authors]
    # Drop blanks produced by trailing separators or removed parentheticals.
    return [a for a in authors if a]
@@ -42,10 +48,28 @@ def test_parse_arxiv_authors():
"Izaak Neri",
"Édgar Roldán",
]
+ assert parse_arxiv_authors("Izaak Neri, and \\'Edgar Rold\\'an") == [
+ "Izaak Neri",
+ "Édgar Roldán",
+ ]
+ assert parse_arxiv_authors("Izaak Neri, et al.") == [
+ "Izaak Neri",
+ "et al.",
+ ]
assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
"Raphael Chetrite Shamik Gupta",
]
+ assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [
+ "B. P. Lanyon",
+ "T. J. Weinhold",
+ "N. K. Langford",
+ "M. Barbieri",
+ "D. F. V. James",
+ "A. Gilchrist",
+ "A. G. White",
+ ]
+
class ArxivRawImporter(EntityImporter):
"""