diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 13:52:19 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-22 13:52:19 -0700 | 
| commit | 5e48498083cf5b7bccec03ba6352c6acbe6cdb00 (patch) | |
| tree | 3a63e5b92c329638f1d31ca69658a443844311d0 /python | |
| parent | 3e21211d65f86990193f63dcfcc71db81720b454 (diff) | |
| download | fatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.tar.gz fatcat-5e48498083cf5b7bccec03ba6352c6acbe6cdb00.zip | |
improve arxiv author name parsing
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 24 | 
1 files changed, 24 insertions, 0 deletions
```diff
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 1c65890e..e6fb8039 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -1,4 +1,5 @@
 
+import re
 import sys
 import json
 import datetime
@@ -21,12 +22,17 @@ def latex_to_text(raw):
 def parse_arxiv_authors(raw):
     if not raw:
         return []
+    raw = raw.replace('*', '')
+    if '(' in raw:
+        raw = re.sub(r'\(.*\)', '', raw)
     authors = raw.split(', ')
     if authors:
         last = authors[-1].split(" and ")
         if len(last) == 2:
             authors[-1] = last[0]
             authors.append(last[1])
+        if authors[-1].startswith("and "):
+            authors[-1] = authors[-1][4:]
     authors = [latex_to_text(a).strip() for a in authors]
     return authors
 
@@ -42,10 +48,28 @@ def test_parse_arxiv_authors():
         "Izaak Neri",
         "Édgar Roldán",
     ]
+    assert parse_arxiv_authors("Izaak Neri, and \\'Edgar Rold\\'an") == [
+        "Izaak Neri",
+        "Édgar Roldán",
+    ]
+    assert parse_arxiv_authors("Izaak Neri, et al.") == [
+        "Izaak Neri",
+        "et al.",
+    ]
     assert parse_arxiv_authors("Raphael Chetrite Shamik Gupta") == [
         "Raphael Chetrite Shamik Gupta",
     ]
 
+    assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V.  James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [
+        "B. P. Lanyon",
+        "T. J. Weinhold",
+        "N. K. Langford",
+        "M. Barbieri",
+        "D. F. V.  James",
+        "A. Gilchrist",
+        "A. G. White",
+    ]
+
 
 class ArxivRawImporter(EntityImporter):
     """
```
