From 5aeb5f79d83a2559671fed6d9afed2b0987139b4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 23 Jan 2019 15:33:44 -0800 Subject: ftfy all over (needs Pipfile.lock) --- python/Pipfile | 1 + python/fatcat_tools/importers/__init__.py | 2 +- python/fatcat_tools/importers/common.py | 31 +++++++++++++++++ python/fatcat_tools/importers/crossref.py | 41 ++++++++++++----------- python/fatcat_tools/importers/grobid_metadata.py | 23 +++++++------ python/fatcat_tools/importers/journal_metadata.py | 6 ++-- python/fatcat_tools/importers/matched.py | 2 +- python/fatcat_tools/importers/orcid.py | 8 ++--- 8 files changed, 75 insertions(+), 39 deletions(-) diff --git a/python/Pipfile b/python/Pipfile index eebdab36..b04bb91a 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -32,6 +32,7 @@ python-dateutil = "*" sickle = "*" python-snappy = "*" pymacaroons = "*" +ftfy= "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index b709f714..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -12,7 +12,7 @@ To run an import you combine two classes; one each of: """ -from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2d5c89b3..1c99c7d7 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from 
collections import Counter @@ -12,6 +13,36 @@ import fatcat_client from fatcat_client.rest import ApiException +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. + + It will try to clean up common unicode mangles, HTML characters, etc. + + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&amp;' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. + """ + if not thing: + return thing + fix_entities = 'auto' + if force_xml: + fix_entities = True + return ftfy.fix_text(thing, fix_entities=fix_entities).strip() + +def test_clean(): + + assert clean(None) == None + assert clean('') == '' + assert clean('123') == '123' + assert clean('a&amp;b') == 'a&b' + assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>' + assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>' + class EntityImporter: """ Base class for fatcat entity importers. 
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 22abd08d..cbb6deb5 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime import itertools import subprocess import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there @@ -169,7 +169,7 @@ class CrossrefImporter(EntityImporter): raw_affiliation = am.get('affiliation')[0]['name'] if len(am.get('affiliation')) > 1: # note: affiliation => affiliations - extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]] + extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": extra['sequence'] = am.get('sequence') if not extra: @@ -178,8 +178,8 @@ class CrossrefImporter(EntityImporter): contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, - raw_affiliation=raw_affiliation, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -199,9 +199,9 @@ class CrossrefImporter(EntityImporter): and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, + publisher=clean(publisher), container_type=self.map_container_type(release_type), - name=obj['container-title'][0]) + name=clean(obj['container-title'][0], force_xml=True)) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -257,10 +257,10 @@ class CrossrefImporter(EntityImporter): # doing lookups would be a second import pass target_release_id=None, key=key, - year=year, - container_name=container_name, - title=ref_title, - locator=ref_locator, + year=clean(year), + container_name=clean(container_name), + title=clean(ref_title), + 
locator=clean(ref_locator), # TODO: just dump JSON somewhere here? extra=extra)) @@ -269,7 +269,7 @@ class CrossrefImporter(EntityImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() @@ -279,13 +279,16 @@ class CrossrefImporter(EntityImporter): # TODO: unpack "container-title" array val = obj.get(key) if val: - extra[key] = val + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val if 'license' in extra and extra['license']: for i in range(len(extra['license'])): if 'start' in extra['license'][i]: extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # TODO: this should be top-level extra['is_kept'] = len(obj.get('archive', [])) > 0 @@ -329,13 +332,13 @@ class CrossrefImporter(EntityImporter): re = fatcat_client.ReleaseEntity( work_id=None, container_id=container_id, - title=obj.get('title', [None])[0], - original_title=obj.get('original-title', [None])[0], + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), release_type=release_type, release_status=release_status, release_date=release_date, release_year=release_year, - publisher=publisher, + publisher=clean(publisher), doi=obj['DOI'].lower(), pmid=extids['pmid'], pmcid=extids['pmcid'], @@ -344,9 +347,9 @@ class CrossrefImporter(EntityImporter): core_id=extids['core_id'], arxiv_id=extids['arxiv_id'], jstor_id=extids['jstor_id'], - volume=obj.get('volume'), - issue=obj.get('issue'), - pages=obj.get('page'), + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), language=None, # crossref doesn't supply language info license_slug=license_slug, 
extra=dict(crossref=extra), diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 4d3b41bc..468b0ede 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,7 +5,7 @@ import json import base64 import datetime import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 @@ -82,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -91,17 +91,18 @@ class GrobidMetadataImporter(EntityImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -110,9 +111,9 @@ class GrobidMetadataImporter(EntityImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -141,15 +142,15 @@ class GrobidMetadataImporter(EntityImporter): extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, 
release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index ff38cd77..ccdb7ec6 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean def or_none(s): @@ -72,8 +72,8 @@ class JournalMetadataImporter(EntityImporter): ) ce = fatcat_client.ContainerEntity( issnl=issnl, - name=title, - publisher=or_none(row['publisher']), + name=clean(title), + publisher=or_none(clean(row['publisher'])), extra=extra) return ce diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 2be15860..055f9c6a 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json import sqlite3 import itertools import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean class MatchedImporter(EntityImporter): diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 2c39db18..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -63,9 +63,9 @@ class OrcidImporter(EntityImporter): return None ce = fatcat_client.CreatorEntity( 
orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce -- cgit v1.2.3