summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-21 18:13:20 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-21 18:13:20 -0700
commit0efc5bb015f3b00affc22662740429a1652b3064 (patch)
tree646aa4205d8b7f995fe9f6ebef486dbda685fb96 /python/fatcat_tools/importers
parentbf07be0e0501d15fb9b31dbf696f95c56b16f4f1 (diff)
downloadfatcat-0efc5bb015f3b00affc22662740429a1652b3064.tar.gz
fatcat-0efc5bb015f3b00affc22662740429a1652b3064.zip
more JALC importer tweaks
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/jalc.py17
1 files changed, 10 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2b019b3d..81c2b7f8 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -95,7 +95,11 @@ class JalcImporter(EntityImporter):
contribs = []
people = record.find_all("Person")
if people and (len(people) % 2 == 0) and is_cjk(people[1].find('name').string):
- # both english and japanese names are included for every author
+ # both english and japanese names are usually included for every author
+ # TODO: turns out this isn't always the case; see
+ # 10.18948/shasetaikai.1990.0_601 as an example with 4 actual
+ # authors, but 5 Person entries; all 4 authors in japanese, a
+ # single author in both japanese and english. Ugh!
for i in range(int(len(people)/2)):
eng = people[i*2]
jpn = people[i*2 + 1]
@@ -181,16 +185,15 @@ class JalcImporter(EntityImporter):
if record.publicationName:
pubs = [p.string.strip() for p in record.find_all("publicationName")]
- pubs = [p for p in pubs if p]
+ pubs = [clean(p) for p in pubs if p]
assert(pubs)
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
- elif len(pubs) > 1 and is_cjk(pubs[0]):
- # ordering is not reliable
+ if len(pubs) > 1 and is_cjk(pubs[0]):
+ # eng/jpn ordering is not reliable
pubs = [pubs[1], pubs[0]]
container_name = clean(pubs[0])
if len(pubs) > 1:
- orig_container_name = pubs[1]
container_extra['original_name'] = clean(pubs[1])
if record.publisher:
@@ -198,12 +201,12 @@ class JalcImporter(EntityImporter):
pubs = [p for p in pubs if p]
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
- elif len(pubs) > 1 and is_cjk(pubs[0]):
+ if len(pubs) > 1 and is_cjk(pubs[0]):
# ordering is not reliable
pubs = [pubs[1], pubs[0]]
publisher = clean(pubs[0])
if len(pubs) > 1:
- container_extra['publisher_alt_name'] = pubs[1]
+ container_extra['publisher_aliases'] = pubs[1:]
if (container_id is None and self.create_containers and (issnl is not None)
and container_name):