aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-01-23 15:33:44 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-01-23 15:33:44 -0800
commit5aeb5f79d83a2559671fed6d9afed2b0987139b4 (patch)
tree3a4cbec2e5307f8b84b15fb703dbc62547a31154
parent1cc4f517390d6cb09155746778a0ae566c9725c7 (diff)
downloadfatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.tar.gz
fatcat-5aeb5f79d83a2559671fed6d9afed2b0987139b4.zip
ftfy all over (needs Pipfile.lock)
-rw-r--r--python/Pipfile1
-rw-r--r--python/fatcat_tools/importers/__init__.py2
-rw-r--r--python/fatcat_tools/importers/common.py31
-rw-r--r--python/fatcat_tools/importers/crossref.py41
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py23
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py6
-rw-r--r--python/fatcat_tools/importers/matched.py2
-rw-r--r--python/fatcat_tools/importers/orcid.py8
8 files changed, 75 insertions, 39 deletions
diff --git a/python/Pipfile b/python/Pipfile
index eebdab36..b04bb91a 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -32,6 +32,7 @@ python-dateutil = "*"
sickle = "*"
python-snappy = "*"
pymacaroons = "*"
+ftfy= "*"
[requires]
# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index b709f714..70f38f5b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 2d5c89b3..1c99c7d7 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import ftfy
import itertools
import subprocess
from collections import Counter
@@ -12,6 +13,36 @@ import fatcat_client
from fatcat_client.rest import ApiException
+def clean(thing, force_xml=False):
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
+
+ It will try to clean up common unicode mangles, HTML characters, etc.
+
+ This will detect XML/HTML and "do the right thing" (aka, not decode
+ entities like '&amp;' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which always decodes entities; that might be appropriate
+ for, eg, names and titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return thing
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ return ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+
+def test_clean():
+
+ assert clean(None) == None
+ assert clean('') == ''
+ assert clean('123') == '123'
+ assert clean('a&amp;b') == 'a&b'
+ assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
class EntityImporter:
"""
Base class for fatcat entity importers.
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 22abd08d..cbb6deb5 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
import itertools
import subprocess
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
# The docs/guide should be the cannonical home for these mappings; update there
@@ -169,7 +169,7 @@ class CrossrefImporter(EntityImporter):
raw_affiliation = am.get('affiliation')[0]['name']
if len(am.get('affiliation')) > 1:
# note: affiliation => affiliations
- extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]]
+ extra['affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
if am.get('sequence') and am.get('sequence') != "additional":
extra['sequence'] = am.get('sequence')
if not extra:
@@ -178,8 +178,8 @@ class CrossrefImporter(EntityImporter):
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
- raw_name=raw_name,
- raw_affiliation=raw_affiliation,
+ raw_name=clean(raw_name),
+ raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
return contribs
@@ -199,9 +199,9 @@ class CrossrefImporter(EntityImporter):
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=publisher,
+ publisher=clean(publisher),
container_type=self.map_container_type(release_type),
- name=obj['container-title'][0])
+ name=clean(obj['container-title'][0], force_xml=True))
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
@@ -257,10 +257,10 @@ class CrossrefImporter(EntityImporter):
# doing lookups would be a second import pass
target_release_id=None,
key=key,
- year=year,
- container_name=container_name,
- title=ref_title,
- locator=ref_locator,
+ year=clean(year),
+ container_name=clean(container_name),
+ title=clean(ref_title),
+ locator=clean(ref_locator),
# TODO: just dump JSON somewhere here?
extra=extra))
@@ -269,7 +269,7 @@ class CrossrefImporter(EntityImporter):
if obj.get('abstract') != None:
abstracts.append(fatcat_client.ReleaseEntityAbstracts(
mimetype="application/xml+jats",
- content=obj.get('abstract')))
+ content=clean(obj.get('abstract'))))
# extra fields
extra = dict()
@@ -279,13 +279,16 @@ class CrossrefImporter(EntityImporter):
# TODO: unpack "container-title" array
val = obj.get(key)
if val:
- extra[key] = val
+ if type(val) == str:
+ extra[key] = clean(val)
+ else:
+ extra[key] = val
if 'license' in extra and extra['license']:
for i in range(len(extra['license'])):
if 'start' in extra['license'][i]:
extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
if len(obj['title']) > 1:
- extra['other-titles'] = obj['title'][1:]
+ extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
# TODO: this should be top-level
extra['is_kept'] = len(obj.get('archive', [])) > 0
@@ -329,13 +332,13 @@ class CrossrefImporter(EntityImporter):
re = fatcat_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- title=obj.get('title', [None])[0],
- original_title=obj.get('original-title', [None])[0],
+ title=clean(obj.get('title', [None])[0], force_xml=True),
+ original_title=clean(obj.get('original-title', [None])[0]),
release_type=release_type,
release_status=release_status,
release_date=release_date,
release_year=release_year,
- publisher=publisher,
+ publisher=clean(publisher),
doi=obj['DOI'].lower(),
pmid=extids['pmid'],
pmcid=extids['pmcid'],
@@ -344,9 +347,9 @@ class CrossrefImporter(EntityImporter):
core_id=extids['core_id'],
arxiv_id=extids['arxiv_id'],
jstor_id=extids['jstor_id'],
- volume=obj.get('volume'),
- issue=obj.get('issue'),
- pages=obj.get('page'),
+ volume=clean(obj.get('volume')),
+ issue=clean(obj.get('issue')),
+ pages=clean(obj.get('page')),
language=None, # crossref doesn't supply language info
license_slug=license_slug,
extra=dict(crossref=extra),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 4d3b41bc..468b0ede 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,7 +5,7 @@ import json
import base64
import datetime
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
@@ -82,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
@@ -91,17 +91,18 @@ class GrobidMetadataImporter(EntityImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -110,9 +111,9 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
@@ -141,15 +142,15 @@ class GrobidMetadataImporter(EntityImporter):
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index ff38cd77..ccdb7ec6 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
def or_none(s):
@@ -72,8 +72,8 @@ class JournalMetadataImporter(EntityImporter):
)
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- name=title,
- publisher=or_none(row['publisher']),
+ name=clean(title),
+ publisher=or_none(clean(row['publisher'])),
extra=extra)
return ce
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2be15860..055f9c6a 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
class MatchedImporter(EntityImporter):
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 2c39db18..02c9bf00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
@@ -63,9 +63,9 @@ class OrcidImporter(EntityImporter):
return None
ce = fatcat_client.CreatorEntity(
orcid=orcid,
- given_name=given,
- surname=sur,
- display_name=display,
+ given_name=clean(given),
+ surname=clean(sur),
+ display_name=clean(display),
extra=extra)
return ce