From 0efc5bb015f3b00affc22662740429a1652b3064 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 May 2019 18:13:20 -0700 Subject: more JALC importer tweaks --- python/fatcat_tools/importers/jalc.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2b019b3d..81c2b7f8 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -95,7 +95,11 @@ class JalcImporter(EntityImporter): contribs = [] people = record.find_all("Person") if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): - # both english and japanese names are included for every author + # both english and japanese names are usually included for every author + # TODO: turns out this isn't always the case; see + # 10.18948/shasetaikai.1990.0_601 as an example with 4 actual + # authors, but 5 Person entries; all 4 authors in japanese, a + # single author in both japanese and english. Ugh! 
for i in range(int(len(people)/2)): eng = people[i*2] jpn = people[i*2 + 1] @@ -181,16 +185,15 @@ class JalcImporter(EntityImporter): if record.publicationName: pubs = [p.string.strip() for p in record.find_all("publicationName")] - pubs = [p for p in pubs if p] + pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): - # ordering is not reliable + if len(pubs) > 1 and is_cjk(pubs[0]): + # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] container_name = clean(pubs[0]) if len(pubs) > 1: - orig_container_name = pubs[1] container_extra['original_name'] = clean(pubs[1]) if record.publisher: @@ -198,12 +201,12 @@ class JalcImporter(EntityImporter): pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): + if len(pubs) > 1 and is_cjk(pubs[0]): # ordering is not reliable pubs = [pubs[1], pubs[0]] publisher = clean(pubs[0]) if len(pubs) > 1: - container_extra['publisher_alt_name'] = pubs[1] + container_extra['publisher_aliases'] = pubs[1:] if (container_id is None and self.create_containers and (issnl is not None) and container_name): -- cgit v1.2.3