summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/common.py 15
-rw-r--r-- python/fatcat_tools/importers/crossref.py 4
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 13
-rw-r--r-- python/fatcat_tools/importers/issn.py 9
4 files changed, 25 insertions, 16 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 9cf92b41..e1efde80 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -37,12 +37,21 @@ class FatcatImporter:
print("Processed {} lines, inserted {}, updated {}.".format(
self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
+ def create_row(self, row, editgroup_id=None):
+ # sub-classes expected to implement this
+ raise NotImplementedError
+
+ def create_batch(self, rows, editgroup_id=None):
+ # sub-classes expected to implement this
+ raise NotImplementedError
+
def process_source(self, source, group_size=100):
"""Creates and auto-accepts editgroup every group_size rows"""
eg = self.api.create_editgroup(
fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+ i = 0
for i, row in enumerate(source):
- self.create_row(row, editgroup=eg.id)
+ self.create_row(row, editgroup_id=eg.id)
if i > 0 and (i % group_size) == 0:
self.api.accept_editgroup(eg.id)
eg = self.api.create_editgroup(
@@ -57,7 +66,7 @@ class FatcatImporter:
self.counts['processed_lines'] += len(rows)
eg = self.api.create_editgroup(
fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
- self.create_batch(rows, editgroup=eg.id)
+ self.create_batch(rows, editgroup_id=eg.id)
def process_csv_source(self, source, group_size=100, delimiter=','):
reader = csv.DictReader(source, delimiter=delimiter)
@@ -85,7 +94,7 @@ class FatcatImporter:
return container_id
def is_orcid(self, orcid):
- return self._orcid_regex.match(orcid) != None
+ return self._orcid_regex.match(orcid) is not None
def lookup_orcid(self, orcid):
"""Caches calls to the Orcid lookup API endpoint in a local dict"""
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fac8f32b..d0a69cd6 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -112,7 +112,7 @@ class CrossrefImporter(FatcatImporter):
extra['sequence'] = am.get('sequence')
if not extra:
extra = None
- assert(ctype in ("author", "editor", "translator"))
+ assert ctype in ("author", "editor", "translator")
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
@@ -133,7 +133,7 @@ class CrossrefImporter(FatcatImporter):
publisher = obj.get('publisher')
ce = None
- if (container_id is None and self.create_containers and issnl != None
+ if (container_id is None and self.create_containers and (issnl is not None)
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index ba8a4e6f..d525d4f7 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -21,7 +21,6 @@ class GrobidMetadataImporter(FatcatImporter):
if not obj.get('title'):
return None
- release = dict()
extra = dict()
if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
@@ -35,7 +34,6 @@ class GrobidMetadataImporter(FatcatImporter):
contribs = []
for i, a in enumerate(obj.get('authors', [])):
- c = dict(raw_name=a['name'], role="author")
contribs.append(fatcat_client.ReleaseContrib(
index=i,
raw_name=a['name'],
@@ -67,7 +65,6 @@ class GrobidMetadataImporter(FatcatImporter):
ref['extra'] = cite_extra
refs.append(ref)
- release_type = "article-journal"
release_date = None
if obj.get('date'):
# TODO: only returns year, ever? how to handle?
@@ -77,7 +74,7 @@ class GrobidMetadataImporter(FatcatImporter):
extra['doi'] = obj['doi']
if obj['journal'] and obj['journal'].get('name'):
extra['container_name'] = obj['journal']['name']
-
+
extra['is_longtail_oa'] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -89,6 +86,8 @@ class GrobidMetadataImporter(FatcatImporter):
re = fatcat_client.ReleaseEntity(
title=obj['title'].strip(),
+ release_type="article-journal",
+ release_date=release_date,
contribs=contribs,
refs=refs,
publisher=obj['journal'].get('publisher'),
@@ -97,7 +96,7 @@ class GrobidMetadataImporter(FatcatImporter):
abstracts=abstracts,
extra=extra)
return re
-
+
# TODO: make this a common function somewhere
def make_url(self, raw):
rel = self.default_link_rel
@@ -111,7 +110,7 @@ class GrobidMetadataImporter(FatcatImporter):
return fatcat_client.FileEntityUrls(url=raw, rel=rel)
def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
-
+
sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
# lookup existing SHA1, or create new entity
@@ -141,7 +140,7 @@ class GrobidMetadataImporter(FatcatImporter):
fe.urls.append(
fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
original_url = self.make_url(original)
- if original_url != None:
+ if original_url is not None:
fe.urls.append(original_url)
return fe
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index 0b0efccb..f702dc60 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -17,6 +17,7 @@ def truthy(s):
if s is None:
return None
s = s.lower()
+
if s in ('true', 't', 'yes', 'y', '1'):
return True
elif s in ('false', 'f', 'no', 'n', '0'):
@@ -37,12 +38,12 @@ class IssnImporter(FatcatImporter):
def parse_issn_row(self, row):
"""
row is a python dict (parsed from CSV).
- returns a ContainerEntity
+ returns a ContainerEntity (or None if invalid or couldn't parse)
"""
title = or_none(row['title'])
issnl = or_none(row['ISSN-L'])
if title is None or issnl is None:
- return
+ return None
extra = dict(
in_doaj=truthy(row['in_doaj']),
in_road=truthy(row['in_road']),
@@ -72,7 +73,7 @@ class IssnImporter(FatcatImporter):
def create_batch(self, batch, editgroup=None):
"""Reads and processes in batches (not API-call-per-line)"""
objects = [self.parse_issn_row(l)
- for l in batch if l != None]
- objects = [o for o in objects if o != None]
+ for l in batch if (l is not None)]
+ objects = [o for o in objects if (o is not None)]
self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup)
self.counts['insert'] += len(objects)