diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
commit | 5a508d61daf23a4bfa337c4229bbb6795b69fbd2 (patch) | |
tree | 3a8e744411c2db215d666cd600d4679c5a16e9a9 /fatcat_scholar/schema.py | |
parent | c71314e46dcf18905d1957579a211bb47c520d57 (diff) | |
download | fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.tar.gz fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.zip |
fixes from manual testing
Diffstat (limited to 'fatcat_scholar/schema.py')
-rw-r--r-- | fatcat_scholar/schema.py | 18 |
1 files changed, 11 insertions, 7 deletions
```diff
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index aa4ed52..74c80c8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -186,20 +186,23 @@ def scrub_text(raw: str, mimetype: str = None) -> str:
     TODO: barely implemented yet
     """
     if "<jats" in raw or (mimetype and "application/xml" in mimetype):
-        root = ET.fromstring(raw)
-        raw = " ".join(list(root.itertext())) or ""
+        try:
+            root = ET.fromstring(raw)
+            raw = " ".join(list(root.itertext())) or ""
+        except:
+            pass
     raw = ftfy.fix_text(raw)
     assert raw, "Empty abstract"
     return raw
 
 def contrib_name(contrib: ReleaseContrib) -> str:
     # TODO: support more cultural normals for name presentation
-    if contrib.given_name and contrib.family_name:
-        return f"{contrib.given_name} {contrib.family_name}"
+    if contrib.given_name and contrib.surname:
+        return f"{contrib.given_name} {contrib.surname}"
     elif contrib.raw_name:
         return contrib.raw_name
-    elif contrib.family_name:
-        return contrib.family_name
+    elif contrib.surname:
+        return contrib.surname
     else:
         return contrib.given_name
 
@@ -287,7 +290,8 @@ def es_biblio_from_release(release: ReleaseEntity) -> ScholarBiblio:
         container_issnl=container_issnl,
         issns=issns,
 
-        contrib_names=[contrib_name(c) for c in release.contribs if c.index],
+        # TODO; these filters sort of meh. refactor to be above?
+        contrib_names=list(filter(lambda x: bool(x), [contrib_name(c) for c in release.contribs if c.index])),
         contrib_count = len([c for c in release.contribs if c.index]),
         affiliations=list(filter(lambda x: bool(x), [contrib_affiliation(c) for c in release.contribs if c.index])),
     )
```