aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/grobid_metadata.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py23
1 files changed, 12 insertions, 11 deletions
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 4d3b41bc..468b0ede 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,7 +5,7 @@ import json
import base64
import datetime
import fatcat_client
-from .common import EntityImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
@@ -82,7 +82,7 @@ class GrobidMetadataImporter(EntityImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
@@ -91,17 +91,18 @@ class GrobidMetadataImporter(EntityImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -110,9 +111,9 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
@@ -141,15 +142,15 @@ class GrobidMetadataImporter(EntityImporter):
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re