diff options
| -rw-r--r-- | python/Pipfile | 1 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 31 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 41 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 23 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 6 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/orcid.py | 8 | 
8 files changed, 75 insertions, 39 deletions
| diff --git a/python/Pipfile b/python/Pipfile index eebdab36..b04bb91a 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -32,6 +32,7 @@ python-dateutil = "*"  sickle = "*"  python-snappy = "*"  pymacaroons = "*" +ftfy= "*"  [requires]  # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index b709f714..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:  """ -from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean  from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP  from .grobid_metadata import GrobidMetadataImporter  from .journal_metadata import JournalMetadataImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 2d5c89b3..1c99c7d7 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re  import sys  import csv  import json +import ftfy  import itertools  import subprocess  from collections import Counter @@ -12,6 +13,36 @@ import fatcat_client  from fatcat_client.rest import ApiException +def clean(thing, force_xml=False): +    """ +    This function is appropriate to be called on any random, non-markup string, +    such as author names, titles, etc. + +    It will try to clean up commong unicode mangles, HTML characters, etc. 
+ +    This will detect XML/HTML and "do the right thing" (aka, not remove +    entities like '&amp;' if there are tags in the string), unless you pass the +    'force_xml' parameter, which might be appropriate for, eg, names and +    titles, which generally should be projected down to plain text. + +    Also strips extra whitespace. +    """ +    if not thing: +        return thing +    fix_entities = 'auto' +    if force_xml: +        fix_entities = True +    return ftfy.fix_text(thing, fix_entities=fix_entities).strip() + +def test_clean(): + +    assert clean(None) == None +    assert clean('') == '' +    assert clean('123') == '123' +    assert clean('a&amp;b') == 'a&b' +    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>' +    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>' +  class EntityImporter:      """      Base class for fatcat entity importers. diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 22abd08d..cbb6deb5 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime  import itertools  import subprocess  import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean  # The docs/guide should be the cannonical home for these mappings; update there @@ -169,7 +169,7 @@ class CrossrefImporter(EntityImporter):                          raw_affiliation = am.get('affiliation')[0]['name']                      if len(am.get('affiliation')) > 1:                          # note: affiliation => affiliations -                        extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]] +                        extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]                  if am.get('sequence') and am.get('sequence') != "additional":                      extra['sequence'] = am.get('sequence')                  if not extra: @@ -178,8 +178,8 @@ class 
CrossrefImporter(EntityImporter):                  contribs.append(fatcat_client.ReleaseContrib(                      creator_id=creator_id,                      index=index, -                    raw_name=raw_name, -                    raw_affiliation=raw_affiliation, +                    raw_name=clean(raw_name), +                    raw_affiliation=clean(raw_affiliation),                      role=ctype,                      extra=extra))              return contribs @@ -199,9 +199,9 @@ class CrossrefImporter(EntityImporter):              and obj.get('container-title') and len(obj['container-title']) > 0):              ce = fatcat_client.ContainerEntity(                  issnl=issnl, -                publisher=publisher, +                publisher=clean(publisher),                  container_type=self.map_container_type(release_type), -                name=obj['container-title'][0]) +                name=clean(obj['container-title'][0], force_xml=True))              ce_edit = self.create_container(ce)              container_id = ce_edit.ident @@ -257,10 +257,10 @@ class CrossrefImporter(EntityImporter):                  # doing lookups would be a second import pass                  target_release_id=None,                  key=key, -                year=year, -                container_name=container_name, -                title=ref_title, -                locator=ref_locator, +                year=clean(year), +                container_name=clean(container_name), +                title=clean(ref_title), +                locator=clean(ref_locator),                  # TODO: just dump JSON somewhere here?                  
extra=extra)) @@ -269,7 +269,7 @@ class CrossrefImporter(EntityImporter):          if obj.get('abstract') != None:              abstracts.append(fatcat_client.ReleaseEntityAbstracts(                  mimetype="application/xml+jats", -                content=obj.get('abstract'))) +                content=clean(obj.get('abstract'))))          # extra fields          extra = dict() @@ -279,13 +279,16 @@ class CrossrefImporter(EntityImporter):              # TODO: unpack "container-title" array              val = obj.get(key)              if val: -                extra[key] = val +                if type(val) == str: +                    extra[key] = clean(val) +                else: +                    extra[key] = val          if 'license' in extra and extra['license']:              for i in range(len(extra['license'])):                  if 'start' in extra['license'][i]:                      extra['license'][i]['start'] = extra['license'][i]['start']['date-time']          if len(obj['title']) > 1: -            extra['other-titles'] = obj['title'][1:] +            extra['other-titles'] = [clean(t) for t in obj['title'][1:]]          # TODO: this should be top-level          extra['is_kept'] = len(obj.get('archive', [])) > 0 @@ -329,13 +332,13 @@ class CrossrefImporter(EntityImporter):          re = fatcat_client.ReleaseEntity(              work_id=None,              container_id=container_id, -            title=obj.get('title', [None])[0], -            original_title=obj.get('original-title', [None])[0], +            title=clean(obj.get('title', [None])[0], force_xml=True), +            original_title=clean(obj.get('original-title', [None])[0]),              release_type=release_type,              release_status=release_status,              release_date=release_date,              release_year=release_year, -            publisher=publisher, +            publisher=clean(publisher),              doi=obj['DOI'].lower(),              pmid=extids['pmid'],              
pmcid=extids['pmcid'], @@ -344,9 +347,9 @@ class CrossrefImporter(EntityImporter):              core_id=extids['core_id'],              arxiv_id=extids['arxiv_id'],              jstor_id=extids['jstor_id'], -            volume=obj.get('volume'), -            issue=obj.get('issue'), -            pages=obj.get('page'), +            volume=clean(obj.get('volume')), +            issue=clean(obj.get('issue')), +            pages=clean(obj.get('page')),              language=None,  # crossref doesn't supply language info              license_slug=license_slug,              extra=dict(crossref=extra), diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 4d3b41bc..468b0ede 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,7 +5,7 @@ import json  import base64  import datetime  import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean  MAX_ABSTRACT_BYTES=4096 @@ -82,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):              abobj = dict(                  mimetype="text/plain",                  language=None, -                content=obj.get('abstract').strip()) +                content=clean(obj.get('abstract')))              abstracts = [abobj]          else:              abstracts = None @@ -91,17 +91,18 @@ class GrobidMetadataImporter(EntityImporter):          for i, a in enumerate(obj.get('authors', [])):              contribs.append(fatcat_client.ReleaseContrib(                  index=i, -                raw_name=a['name'], +                raw_name=clean(a['name']),                  role="author",                  extra=None)) +        # XXX: why is this a dict()? not covered by tests?          
refs = []          for raw in obj.get('citations', []):              cite_extra = dict()              ref = dict() -            ref['key'] = raw.get('id') +            ref['key'] = clean(raw.get('id'))              if raw.get('title'): -                ref['title'] = raw['title'].strip() +                ref['title'] = clean(raw['title'])              if raw.get('date'):                  try:                      year = int(raw['date'].strip()[:4]) @@ -110,9 +111,9 @@ class GrobidMetadataImporter(EntityImporter):                      pass              for key in ('volume', 'url', 'issue', 'publisher'):                  if raw.get(key): -                    cite_extra[key] = raw[key].strip() +                    cite_extra[key] = clean(raw[key])              if raw.get('authors'): -                cite_extra['authors'] = [a['name'] for a in raw['authors']] +                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]              if cite_extra:                  cite_extra = dict(grobid=cite_extra)              else: @@ -141,15 +142,15 @@ class GrobidMetadataImporter(EntityImporter):              extra = None          re = fatcat_client.ReleaseEntity( -            title=obj['title'].strip(), +            title=clean(obj['title'], force_xml=True),              release_type="article-journal",              release_date=release_date,              release_year=release_year,              contribs=contribs,              refs=refs, -            publisher=obj['journal'].get('publisher'), -            volume=obj['journal'].get('volume'), -            issue=obj['journal'].get('issue'), +            publisher=clean(obj['journal'].get('publisher')), +            volume=clean(obj['journal'].get('volume')), +            issue=clean(obj['journal'].get('issue')),              abstracts=abstracts,              extra=extra)          return re diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 
ff38cd77..ccdb7ec6 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -3,7 +3,7 @@ import sys  import json  import itertools  import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean  def or_none(s): @@ -72,8 +72,8 @@ class JournalMetadataImporter(EntityImporter):          )          ce = fatcat_client.ContainerEntity(              issnl=issnl, -            name=title, -            publisher=or_none(row['publisher']), +            name=clean(title), +            publisher=or_none(clean(row['publisher'])),              extra=extra)          return ce diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 2be15860..055f9c6a 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json  import sqlite3  import itertools  import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean  class MatchedImporter(EntityImporter): diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 2c39db18..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys  import json  import itertools  import fatcat_client -from .common import EntityImporter +from .common import EntityImporter, clean  def value_or_none(e):      if type(e) == dict: @@ -63,9 +63,9 @@ class OrcidImporter(EntityImporter):              return None          ce = fatcat_client.CreatorEntity(              orcid=orcid, -            given_name=given, -            surname=sur, -            display_name=display, +            given_name=clean(given), +            surname=clean(sur), +            display_name=clean(display),              extra=extra)          return ce | 
