aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-29 18:13:35 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-29 18:13:35 -0700
commitab9f5664a89cb19fbe17b42332d2d7284ab9c416 (patch)
treebe196056ac1d1d2b3c23e99520c58afec3b452b1 /python/fatcat_tools
parenta73065f7ee22cc02d70d8e144af39954b9c6594a (diff)
downloadfatcat-ab9f5664a89cb19fbe17b42332d2d7284ab9c416.tar.gz
fatcat-ab9f5664a89cb19fbe17b42332d2d7284ab9c416.zip
improve JALC author handling
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/jalc.py144
1 files changed, 85 insertions, 59 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 94ebeff9..914f2884 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -11,6 +11,90 @@ import fatcat_client
from .common import EntityImporter, clean, is_cjk, DATE_FMT
+def parse_jalc_persons(raw_persons):
+ """
+ For the most part, JALC DC names are in either japanese or english. The
+ two common patterns are a list alternating between the two (in which case
+ the names are translations), or all in one language or the other.
+
+ Because dublin core is a projection tossing away a bunch of context, the
+ other cases are hard to disambiguate. There are also some cases with Korean
+ and other languages mixed in. This crude method doesn't handle everything
+ right; it tries to just get the two common patterns correct. Sorry humans!
+
+ Edge cases for this function:
+ - 10.11316/jpsgaiyo.56.1.4.0_757_3 <= all english, some japanese, works
+ - 10.14988/pa.2017.0000013531 <= complex, not japanese/english, mixed
+ - 10.15036/arerugi.62.1407_1 <= one japanese, two english; fails
+ - 10.14988/pa.2017.0000007327 <= ambiguous; translator in jpn/eng
+ """
+
+ persons = []
+
+ # first parse out into language-agnostic dics
+ for raw in raw_persons:
+ name = raw.find('name') or None
+ if name:
+ name = clean(name.string)
+ surname = raw.find('familyName') or None
+ if surname:
+ surname = clean(surname.string)
+ given_name = raw.find('givenName') or None
+ if given_name:
+ given_name = clean(given_name.string)
+ lang = 'en'
+ if is_cjk(name):
+ lang = 'ja'
+ if lang == 'en' and surname and given_name:
+ # english names order is flipped
+ name = "{} {}".format(given_name, surname)
+ rc = fatcat_client.ReleaseContrib(
+ raw_name=name,
+ surname=surname,
+ given_name=given_name,
+ role="author")
+ # add an extra hint field; won't end up in serialized object
+ rc._lang = lang
+ persons.append(rc)
+
+ if not persons:
+ return []
+
+ if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]):
+ # all english names, or all japanese names
+ return persons
+
+ if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
+ print("INTERESTING: {}".format(persons[0]))
+
+ start_lang = persons[0]._lang
+ contribs = []
+ for p in persons:
+ if p._lang == start_lang:
+ contribs.append(p)
+ else:
+ if p._lang == 'en' and contribs[-1]._lang == 'ja':
+ eng = p
+ jpn = contribs[-1]
+ elif p._lang == 'ja' and contribs[-1]._lang == 'en':
+ eng = contribs[-1]
+ jpn = p
+ else:
+ # give up and just add as another author
+ contribs.append(p)
+ continue
+ eng.extra = {
+ 'original_name': {
+ 'lang': jpn._lang,
+ 'raw_name': jpn.raw_name,
+ 'given_name': jpn.given_name,
+ 'surname': jpn.surname,
+ },
+ }
+ contribs[-1] = eng
+ return contribs
+
+
class JalcImporter(EntityImporter):
"""
Importer for JALC DOI metadata.
@@ -104,66 +188,8 @@ class JalcImporter(EntityImporter):
if not doi:
return None
- contribs = []
people = record.find_all("Person")
- if (people and (len(people) % 2 == 0)
- and not is_cjk(people[0].find('name').string)
- and is_cjk(people[1].find('name').string)):
- # both english and japanese names are usually included for every author
- # TODO: turns out this isn't always the case; see
- # 10.18948/shasetaikai.1990.0_601 as an example with 4 actual
- # authors, but 5 Person entries; all 4 authors in japanese, a
- # single author in both japanese in english. Ugh!
- for i in range(int(len(people)/2)):
- eng = people[i*2]
- jpn = people[i*2 + 1]
- # there isn't always an english name though? TODO
- name = eng
- if not name.find('name'):
- name = jpn
- surname = name.find('familyName')
- if surname:
- surname = surname.string
- given_name = name.find('givenName')
- if given_name:
- given_name = given_name.string
- contrib = fatcat_client.ReleaseContrib(
- raw_name=clean(name.find('name').string),
- given_name=clean(given_name),
- surname=clean(surname),
- role='author',
- )
- if eng.find('name') and jpn.find('name'):
- surname = jpn.find('familyName')
- if surname:
- surname = surname.string
- given_name = jpn.find('givenName')
- if given_name:
- given_name = given_name.string
- contrib.extra = {
- 'original_name': {
- 'lang': 'ja',
- 'raw_name': clean(jpn.find('name').string),
- 'given_name': clean(given_name),
- 'surname': clean(surname),
- }}
- contribs.append(contrib)
- elif people:
- # TODO: test for this codepath?
- for eng in people:
- surname = eng.find('familyName')
- if surname:
- surname = surname.string
- given_name = eng.find('givenName')
- if given_name:
- given_name = given_name.string
- contrib = fatcat_client.ReleaseContrib(
- raw_name=clean(eng.find('name').string),
- given_name=clean(given_name),
- surname=clean(surname),
- role='author',
- )
- contribs.append(contrib)
+ contribs = parse_jalc_persons(people)
for i, contrib in enumerate(contribs):
if contrib.raw_name != "et al.":