diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 18:13:20 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 18:13:20 -0700 |
commit | 0efc5bb015f3b00affc22662740429a1652b3064 (patch) | |
tree | 646aa4205d8b7f995fe9f6ebef486dbda685fb96 /python/fatcat_tools | |
parent | bf07be0e0501d15fb9b31dbf696f95c56b16f4f1 (diff) | |
download | fatcat-0efc5bb015f3b00affc22662740429a1652b3064.tar.gz fatcat-0efc5bb015f3b00affc22662740429a1652b3064.zip |
more JALC importer tweaks
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 17 |
1 file changed, 10 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2b019b3d..81c2b7f8 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -95,7 +95,11 @@ class JalcImporter(EntityImporter): contribs = [] people = record.find_all("Person") if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string): - # both english and japanese names are included for every author + # both english and japanese names are usually included for every author + # TODO: turns out this isn't always the case; see + # 10.18948/shasetaikai.1990.0_601 as an example with 4 actual + # authors, but 5 Person entries; all 4 authors in japanese, a + # single author in both japanese in english. Ugh! for i in range(int(len(people)/2)): eng = people[i*2] jpn = people[i*2 + 1] @@ -181,16 +185,15 @@ class JalcImporter(EntityImporter): if record.publicationName: pubs = [p.string.strip() for p in record.find_all("publicationName")] - pubs = [p for p in pubs if p] + pubs = [clean(p) for p in pubs if p] assert(pubs) if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): - # ordering is not reliable + if len(pubs) > 1 and is_cjk(pubs[0]): + # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] container_name = clean(pubs[0]) if len(pubs) > 1: - orig_container_name = pubs[1] container_extra['original_name'] = clean(pubs[1]) if record.publisher: @@ -198,12 +201,12 @@ class JalcImporter(EntityImporter): pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] - elif len(pubs) > 1 and is_cjk(pubs[0]): + if len(pubs) > 1 and is_cjk(pubs[0]): # ordering is not reliable pubs = [pubs[1], pubs[0]] publisher = clean(pubs[0]) if len(pubs) > 1: - container_extra['publisher_alt_name'] = pubs[1] + container_extra['publisher_aliases'] = pubs[1:] if (container_id is None and self.create_containers and (issnl is not None) and container_name): |