aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/arabesque.py7
-rw-r--r--python/fatcat_tools/importers/arxiv.py9
-rwxr-xr-xpython/fatcat_tools/importers/cdl_dash_dat.py4
-rw-r--r--python/fatcat_tools/importers/chocula.py3
-rw-r--r--python/fatcat_tools/importers/common.py26
-rw-r--r--python/fatcat_tools/importers/crossref.py8
-rw-r--r--python/fatcat_tools/importers/datacite.py2
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py2
-rw-r--r--python/fatcat_tools/importers/ingest.py7
-rw-r--r--python/fatcat_tools/importers/jalc.py3
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py3
-rw-r--r--python/fatcat_tools/importers/jstor.py2
-rw-r--r--python/fatcat_tools/importers/matched.py8
-rw-r--r--python/fatcat_tools/importers/orcid.py4
-rw-r--r--python/fatcat_tools/importers/pubmed.py6
-rw-r--r--python/fatcat_tools/importers/shadow.py5
16 files changed, 27 insertions, 72 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index c71b33e9..47a8c4da 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,10 +1,6 @@
-import sys
-import json
-import sqlite3
-import itertools
import fatcat_openapi_client
-from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex
ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
@@ -186,4 +182,3 @@ class ArabesqueMatchImporter(EntityImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 719592fc..43325ebc 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -7,7 +7,7 @@ from bs4 import BeautifulSoup
from pylatexenc.latex2text import LatexNodes2Text
import fatcat_openapi_client
-from .common import EntityImporter, clean
+from .common import EntityImporter
from .crossref import lookup_license_slug
@@ -97,7 +97,6 @@ class ArxivRawImporter(EntityImporter):
**kwargs)
self._test_override = False
-
def parse_record(self, record):
if not record:
@@ -188,7 +187,6 @@ class ArxivRawImporter(EntityImporter):
if lang == 'en':
lang = None
-
# extra:
# withdrawn_date
# translation_of
@@ -244,7 +242,7 @@ class ArxivRawImporter(EntityImporter):
For each version, do a lookup by full arxiv_id, and store work/release
id results.
-
+
If a version has a DOI, also do a doi lookup and store that result. If
there is an existing release with both matching, set that as the
existing work. If they don't match, use the full arxiv_id match and
@@ -345,6 +343,7 @@ class ArxivRawImporter(EntityImporter):
print(json.dumps(resp))
#sys.exit(-1)
-if __name__=='__main__':
+
+if __name__ == '__main__':
parser = ArxivRawImporter(None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index 536c013b..36a2f9a6 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -82,7 +82,7 @@ def cdl_dash_release(meta, extra=None):
#print(abstracts)
if not abstracts:
abstracts = None
-
+
contribs = []
for creator in meta['creator']:
contribs.append(ReleaseContrib(
@@ -120,7 +120,7 @@ def make_release_fileset(dat_path):
with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp:
meta_dict = json.loads(fp.read())
-
+
release = cdl_dash_release(meta_dict)
ark_id = release.extra['ark_id']
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 375b6051..d5d1cce8 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -1,7 +1,4 @@
-import sys
-import json
-import itertools
import fatcat_openapi_client
from .common import EntityImporter, clean
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index eafc6546..c0578224 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -161,18 +161,18 @@ def is_cjk(s):
return False
def test_is_cjk():
- assert is_cjk(None) == False
- assert is_cjk('') == False
- assert is_cjk('blah') == False
- assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
- assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True
- assert is_cjk('菊') == True
- assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
- assert is_cjk('水道') == True
- assert is_cjk('オウ, イク') == True # kanji
- assert is_cjk('ひヒ') == True
- assert is_cjk('き゚ゅ') == True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+ assert is_cjk(None) is False
+ assert is_cjk('') is False
+ assert is_cjk('blah') is False
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
+ assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
+ assert is_cjk('菊') is True
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
+ assert is_cjk('水道') is True
+ assert is_cjk('オウ, イク') is True # kanji
+ assert is_cjk('ひヒ') is True
+ assert is_cjk('き゚ゅ') is True
+ assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
DOMAIN_REL_MAP = {
"archive.org": "archive",
@@ -368,7 +368,7 @@ class EntityImporter:
if self._entity_queue:
self.insert_batch(self._entity_queue)
self.counts['insert'] += len(self._entity_queue)
- self._entity_queue = []
+ self._entity_queue = []
return self.counts
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index d26f089f..854e3d9f 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,10 +1,6 @@
-import sys
-import json
import sqlite3
import datetime
-import itertools
-import subprocess
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -425,7 +421,6 @@ class CrossrefImporter(EntityImporter):
release_year = raw_date[0]
release_date = None
-
original_title = None
if obj.get('original-title'):
original_title = clean(obj.get('original-title')[0], force_xml=True)
@@ -500,7 +495,7 @@ class CrossrefImporter(EntityImporter):
if existing:
self.counts['exists'] += 1
return False
-
+
return True
def insert_batch(self, batch):
@@ -509,4 +504,3 @@ class CrossrefImporter(EntityImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 434a2941..08c85b30 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -10,7 +10,6 @@ functions (parse_datacite_...), which may help testing.
import collections
import datetime
-import hashlib
import re
import json
import sqlite3
@@ -292,7 +291,6 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping non-ascii doi for now'.format(doi))
return None
-
creators = attributes.get('creators', []) or []
contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 2077eae4..5ec6cc3c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -1,9 +1,7 @@
#!/usr/bin/env python3
-import sys
import json
import base64
-import datetime
import fatcat_openapi_client
from .common import EntityImporter, clean, make_rel_url
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 2b630e67..4b1d3702 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,10 +1,6 @@
-import sys
-import json
-import base64
-import itertools
import fatcat_openapi_client
-from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex
+from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
@@ -284,4 +280,3 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index e30bb233..38aa00eb 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,10 +1,7 @@
import sys
-import json
import sqlite3
import datetime
-import itertools
-import subprocess
from bs4 import BeautifulSoup
import fatcat_openapi_client
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index d439c80a..32782eac 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -1,7 +1,4 @@
-import sys
-import json
-import itertools
import fatcat_openapi_client
from .common import EntityImporter, clean
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 96dbf947..5d35f5e2 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -183,7 +183,7 @@ class JstorImporter(EntityImporter):
# suspect jan 1st dates get set by JSTOR when actual
# date not known (citation needed), so drop them
release_date = None
-
+
volume = None
if article_meta.volume:
volume = article_meta.volume.string or None
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 180d7ba3..d95c5847 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -1,12 +1,8 @@
-import sys
-import json
-import sqlite3
-import itertools
import fatcat_openapi_client
from fatcat_tools.normal import *
-from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
class MatchedImporter(EntityImporter):
@@ -160,7 +156,6 @@ class MatchedImporter(EntityImporter):
self.counts['skip-update-inflight'] += 1
return False
-
# minimum viable "existing" URL cleanup to fix dupes and broken links:
# remove 'None' wayback URLs, and set archive.org rel 'archive'
existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
@@ -207,4 +202,3 @@ class MatchedImporter(EntityImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 554e052f..21feea9e 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -1,7 +1,5 @@
import sys
-import json
-import itertools
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -89,7 +87,7 @@ class OrcidImporter(EntityImporter):
if existing:
self.counts['exists'] += 1
return False
-
+
return True
def insert_batch(self, batch):
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3d3e3a8c..d8a6842c 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,11 +1,9 @@
import sys
import json
-import sqlite3
import datetime
import warnings
from bs4 import BeautifulSoup
-from bs4.element import NavigableString
import fatcat_openapi_client
from fatcat_tools.normal import *
@@ -314,7 +312,7 @@ class PubmedImporter(EntityImporter):
Importer for PubMed/MEDLINE XML metadata.
If lookup_refs is true, will do identifer-based lookups for all references.
-
+
TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
"""
@@ -502,7 +500,7 @@ class PubmedImporter(EntityImporter):
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
-
+
ji = journal.JournalIssue
volume = None
if ji.find("Volume"):
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 4cd22775..c04e9aa8 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -1,8 +1,4 @@
-import sys
-import json
-import sqlite3
-import itertools
import fatcat_openapi_client
from fatcat_tools.normal import *
@@ -192,4 +188,3 @@ class ShadowLibraryImporter(EntityImporter):
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-