about | summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/schema.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-05-20 20:39:02 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-05-20 20:39:02 -0700
commit 5a508d61daf23a4bfa337c4229bbb6795b69fbd2 (patch)
tree 3a8e744411c2db215d666cd600d4679c5a16e9a9 /fatcat_scholar/schema.py
parent c71314e46dcf18905d1957579a211bb47c520d57 (diff)
download fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.tar.gz
fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.zip
fixes from manual testing
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- fatcat_scholar/schema.py 18
1 files changed, 11 insertions, 7 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index aa4ed52..74c80c8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -186,20 +186,23 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
TODO: barely implemented yet
"""
if "<jats" in raw or (mimetype and "application/xml" in mimetype):
- root = ET.fromstring(raw)
- raw = " ".join(list(root.itertext())) or ""
+ try:
+ root = ET.fromstring(raw)
+ raw = " ".join(list(root.itertext())) or ""
+ except:
+ pass
raw = ftfy.fix_text(raw)
assert raw, "Empty abstract"
return raw
def contrib_name(contrib: ReleaseContrib) -> str:
# TODO: support more cultural normals for name presentation
- if contrib.given_name and contrib.family_name:
- return f"{contrib.given_name} {contrib.family_name}"
+ if contrib.given_name and contrib.surname:
+ return f"{contrib.given_name} {contrib.surname}"
elif contrib.raw_name:
return contrib.raw_name
- elif contrib.family_name:
- return contrib.family_name
+ elif contrib.surname:
+ return contrib.surname
else:
return contrib.given_name
@@ -287,7 +290,8 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
container_issnl=container_issnl,
issns=issns,
- contrib_names=[contrib_name(c) for c in release.contribs if c.index],
+ # TODO; these filters sort of meh. refactor to be above?
+ contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs if c.index])),
contrib_count = len([c for c in release.contribs if c.index]),
affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
)