diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 15 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13 | ||||
-rw-r--r-- | python/fatcat_tools/importers/issn.py | 9 |
4 files changed, 25 insertions, 16 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 9cf92b41..e1efde80 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -37,12 +37,21 @@ class FatcatImporter: print("Processed {} lines, inserted {}, updated {}.".format( self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def create_row(self, row, editgroup_id=None): + # sub-classes expected to implement this + raise NotImplementedError + + def create_batch(self, rows, editgroup_id=None): + # sub-classes expected to implement this + raise NotImplementedError + def process_source(self, source, group_size=100): """Creates and auto-accepts editgroup every group_size rows""" eg = self.api.create_editgroup( fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + i = 0 for i, row in enumerate(source): - self.create_row(row, editgroup=eg.id) + self.create_row(row, editgroup_id=eg.id) if i > 0 and (i % group_size) == 0: self.api.accept_editgroup(eg.id) eg = self.api.create_editgroup( @@ -57,7 +66,7 @@ class FatcatImporter: self.counts['processed_lines'] += len(rows) eg = self.api.create_editgroup( fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - self.create_batch(rows, editgroup=eg.id) + self.create_batch(rows, editgroup_id=eg.id) def process_csv_source(self, source, group_size=100, delimiter=','): reader = csv.DictReader(source, delimiter=delimiter) @@ -85,7 +94,7 @@ class FatcatImporter: return container_id def is_orcid(self, orcid): - return self._orcid_regex.match(orcid) != None + return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): """Caches calls to the Orcid lookup API endpoint in a local dict""" diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fac8f32b..d0a69cd6 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -112,7 +112,7 @@ class CrossrefImporter(FatcatImporter): extra['sequence'] = am.get('sequence') if not extra: extra = None - assert(ctype in ("author", "editor", "translator")) + assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, @@ -133,7 +133,7 @@ class CrossrefImporter(FatcatImporter): publisher = obj.get('publisher') ce = None - if (container_id is None and self.create_containers and issnl != None + if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index ba8a4e6f..d525d4f7 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -21,7 +21,6 @@ class GrobidMetadataImporter(FatcatImporter): if not obj.get('title'): return None - release = dict() extra = dict() if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: @@ -35,7 +34,6 @@ class GrobidMetadataImporter(FatcatImporter): contribs = [] for i, a in enumerate(obj.get('authors', [])): - c = dict(raw_name=a['name'], role="author") contribs.append(fatcat_client.ReleaseContrib( index=i, raw_name=a['name'], @@ -67,7 +65,6 @@ class GrobidMetadataImporter(FatcatImporter): ref['extra'] = cite_extra refs.append(ref) - release_type = "article-journal" release_date = None if obj.get('date'): # TODO: only returns year, ever? how to handle? @@ -77,7 +74,7 @@ class GrobidMetadataImporter(FatcatImporter): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): extra['container_name'] = obj['journal']['name'] - + extra['is_longtail_oa'] = True # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -89,6 +86,8 @@ class GrobidMetadataImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( title=obj['title'].strip(), + release_type="article-journal", + release_date=release_date, contribs=contribs, refs=refs, publisher=obj['journal'].get('publisher'), @@ -97,7 +96,7 @@ class GrobidMetadataImporter(FatcatImporter): abstracts=abstracts, extra=extra) return re - + # TODO: make this a common function somewhere def make_url(self, raw): rel = self.default_link_rel @@ -111,7 +110,7 @@ class GrobidMetadataImporter(FatcatImporter): return fatcat_client.FileEntityUrls(url=raw, rel=rel) def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - + sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() # lookup existing SHA1, or create new entity @@ -141,7 +140,7 @@ class GrobidMetadataImporter(FatcatImporter): fe.urls.append( fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) original_url = self.make_url(original) - if original_url != None: + if original_url is not None: fe.urls.append(original_url) return fe diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index 0b0efccb..f702dc60 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -17,6 +17,7 @@ def truthy(s): if s is None: return None s = s.lower() + if s in ('true', 't', 'yes', 'y', '1'): return True elif s in ('false', 'f', 'no', 'n', '0'): @@ -37,12 +38,12 @@ class IssnImporter(FatcatImporter): def parse_issn_row(self, row): """ row is a python dict (parsed from CSV). - returns a ContainerEntity + returns a ContainerEntity (or None if invalid or couldn't parse) """ title = or_none(row['title']) issnl = or_none(row['ISSN-L']) if title is None or issnl is None: - return + return None extra = dict( in_doaj=truthy(row['in_doaj']), in_road=truthy(row['in_road']), @@ -72,7 +73,7 @@ class IssnImporter(FatcatImporter): def create_batch(self, batch, editgroup=None): """Reads and processes in batches (not API-call-per-line)""" objects = [self.parse_issn_row(l) - for l in batch if l != None] - objects = [o for o in objects if o != None] + for l in batch if (l is not None)] + objects = [o for o in objects if (o is not None)] self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) self.counts['insert'] += len(objects) |