diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 15 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/issn.py | 9 | 
4 files changed, 25 insertions, 16 deletions
| diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 9cf92b41..e1efde80 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -37,12 +37,21 @@ class FatcatImporter:          print("Processed {} lines, inserted {}, updated {}.".format(              self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) +    def create_row(self, row, editgroup_id=None): +        # sub-classes expected to implement this +        raise NotImplementedError + +    def create_batch(self, rows, editgroup_id=None): +        # sub-classes expected to implement this +        raise NotImplementedError +      def process_source(self, source, group_size=100):          """Creates and auto-accepts editgroup every group_size rows"""          eg = self.api.create_editgroup(              fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) +        i = 0          for i, row in enumerate(source): -            self.create_row(row, editgroup=eg.id) +            self.create_row(row, editgroup_id=eg.id)              if i > 0 and (i % group_size) == 0:                  self.api.accept_editgroup(eg.id)                  eg = self.api.create_editgroup( @@ -57,7 +66,7 @@ class FatcatImporter:              self.counts['processed_lines'] += len(rows)              eg = self.api.create_editgroup(                  fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) -            self.create_batch(rows, editgroup=eg.id) +            self.create_batch(rows, editgroup_id=eg.id)      def process_csv_source(self, source, group_size=100, delimiter=','):          reader = csv.DictReader(source, delimiter=delimiter) @@ -85,7 +94,7 @@ class FatcatImporter:          return container_id      def is_orcid(self, orcid): -        return self._orcid_regex.match(orcid) != None +        return self._orcid_regex.match(orcid) is not None      def lookup_orcid(self, orcid):          """Caches calls to the Orcid lookup API endpoint in a local dict""" diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fac8f32b..d0a69cd6 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -112,7 +112,7 @@ class CrossrefImporter(FatcatImporter):                      extra['sequence'] = am.get('sequence')                  if not extra:                      extra = None -                assert(ctype in ("author", "editor", "translator")) +                assert ctype in ("author", "editor", "translator")                  contribs.append(fatcat_client.ReleaseContrib(                      creator_id=creator_id,                      index=index, @@ -133,7 +133,7 @@ class CrossrefImporter(FatcatImporter):          publisher = obj.get('publisher')          ce = None -        if (container_id is None and self.create_containers and issnl != None  +        if (container_id is None and self.create_containers and (issnl is not None)              and obj.get('container-title') and len(obj['container-title']) > 0):              ce = fatcat_client.ContainerEntity(                  issnl=issnl, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index ba8a4e6f..d525d4f7 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -21,7 +21,6 @@ class GrobidMetadataImporter(FatcatImporter):          if not obj.get('title'):              return None -        release = dict()          extra = dict()          if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: @@ -35,7 +34,6 @@ class GrobidMetadataImporter(FatcatImporter):          contribs = []          for i, a in enumerate(obj.get('authors', [])): -            c = dict(raw_name=a['name'], role="author")              contribs.append(fatcat_client.ReleaseContrib(                  index=i,                  raw_name=a['name'], @@ -67,7 +65,6 @@ class GrobidMetadataImporter(FatcatImporter):              ref['extra'] = cite_extra              refs.append(ref) -        release_type = "article-journal"          release_date = None          if obj.get('date'):              # TODO: only returns year, ever? how to handle? @@ -77,7 +74,7 @@ class GrobidMetadataImporter(FatcatImporter):              extra['doi'] = obj['doi']          if obj['journal'] and obj['journal'].get('name'):              extra['container_name'] = obj['journal']['name'] -         +          extra['is_longtail_oa'] = True          # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -89,6 +86,8 @@ class GrobidMetadataImporter(FatcatImporter):          re = fatcat_client.ReleaseEntity(              title=obj['title'].strip(), +            release_type="article-journal", +            release_date=release_date,              contribs=contribs,              refs=refs,              publisher=obj['journal'].get('publisher'), @@ -97,7 +96,7 @@ class GrobidMetadataImporter(FatcatImporter):              abstracts=abstracts,              extra=extra)          return re -     +      # TODO: make this a common function somewhere      def make_url(self, raw):          rel = self.default_link_rel @@ -111,7 +110,7 @@ class GrobidMetadataImporter(FatcatImporter):          return fatcat_client.FileEntityUrls(url=raw, rel=rel)      def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): -         +          sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()          # lookup existing SHA1, or create new entity @@ -141,7 +140,7 @@ class GrobidMetadataImporter(FatcatImporter):          fe.urls.append(              fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))          original_url = self.make_url(original) -        if original_url != None: +        if original_url is not None:              fe.urls.append(original_url)          return fe diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index 0b0efccb..f702dc60 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -17,6 +17,7 @@ def truthy(s):      if s is None:          return None      s = s.lower() +      if s in ('true', 't', 'yes', 'y', '1'):          return True      elif s in ('false', 'f', 'no', 'n', '0'): @@ -37,12 +38,12 @@ class IssnImporter(FatcatImporter):      def parse_issn_row(self, row):          """          row is a python dict (parsed from CSV). -        returns a ContainerEntity +        returns a ContainerEntity (or None if invalid or couldn't parse)          """          title = or_none(row['title'])          issnl = or_none(row['ISSN-L'])          if title is None or issnl is None: -            return +            return None          extra = dict(              in_doaj=truthy(row['in_doaj']),              in_road=truthy(row['in_road']), @@ -72,7 +73,7 @@ class IssnImporter(FatcatImporter):      def create_batch(self, batch, editgroup=None):          """Reads and processes in batches (not API-call-per-line)"""          objects = [self.parse_issn_row(l) -                   for l in batch if l != None] -        objects = [o for o in objects if o != None] +                   for l in batch if (l is not None)] +        objects = [o for o in objects if (o is not None)]          self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup)          self.counts['insert'] += len(objects) | 
