diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 9 | ||||
-rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/chocula.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 26 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/importers/orcid.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/importers/shadow.py | 5 |
16 files changed, 27 insertions, 72 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c71b33e9..47a8c4da 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,10 +1,6 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' @@ -186,4 +182,3 @@ class ArabesqueMatchImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 719592fc..43325ebc 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -7,7 +7,7 @@ from bs4 import BeautifulSoup from pylatexenc.latex2text import LatexNodes2Text import fatcat_openapi_client -from .common import EntityImporter, clean +from .common import EntityImporter from .crossref import lookup_license_slug @@ -97,7 +97,6 @@ class ArxivRawImporter(EntityImporter): **kwargs) self._test_override = False - def parse_record(self, record): if not record: @@ -188,7 +187,6 @@ class ArxivRawImporter(EntityImporter): if lang == 'en': lang = None - # extra: # withdrawn_date # translation_of @@ -244,7 +242,7 @@ class ArxivRawImporter(EntityImporter): For each version, do a lookup by full arxiv_id, and store work/release id results. - + If a version has a DOI, also do a doi lookup and store that result. If there is an existing release with both matching, set that as the existing work. If they don't match, use the full arxiv_id match and @@ -345,6 +343,7 @@ class ArxivRawImporter(EntityImporter): print(json.dumps(resp)) #sys.exit(-1) -if __name__=='__main__': + +if __name__ == '__main__': parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 536c013b..36a2f9a6 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -82,7 +82,7 @@ def cdl_dash_release(meta, extra=None): #print(abstracts) if not abstracts: abstracts = None - + contribs = [] for creator in meta['creator']: contribs.append(ReleaseContrib( @@ -120,7 +120,7 @@ def make_release_fileset(dat_path): with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: meta_dict = json.loads(fp.read()) - + release = cdl_dash_release(meta_dict) ark_id = release.extra['ark_id'] diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 375b6051..d5d1cce8 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index eafc6546..c0578224 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -161,18 +161,18 @@ def is_cjk(s): return False def test_is_cjk(): - assert is_cjk(None) == False - assert is_cjk('') == False - assert is_cjk('blah') == False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True - assert is_cjk('菊') == True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True - assert is_cjk('水道') == True - assert is_cjk('オウ, イク') == True # kanji - assert is_cjk('ひヒ') == True - assert is_cjk('き゚ゅ') == True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True DOMAIN_REL_MAP = { "archive.org": "archive", @@ -368,7 +368,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) self.counts['insert'] += len(self._entity_queue) - self._entity_queue = [] + self._entity_queue = [] return self.counts diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d26f089f..854e3d9f 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,10 +1,6 @@ -import sys -import json import sqlite3 import datetime -import itertools -import subprocess import fatcat_openapi_client from .common import EntityImporter, clean @@ -425,7 +421,6 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None - original_title = None if obj.get('original-title'): original_title = clean(obj.get('original-title')[0], force_xml=True) @@ -500,7 +495,7 @@ class CrossrefImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): @@ -509,4 +504,3 @@ class CrossrefImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 434a2941..08c85b30 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -10,7 +10,6 @@ functions (parse_datacite_...), which may help testing. import collections import datetime -import hashlib import re import json import sqlite3 @@ -292,7 +291,6 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 2077eae4..5ec6cc3c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 -import sys import json import base64 -import datetime import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 2b630e67..4b1d3702 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,10 +1,6 @@ -import sys -import json -import base64 -import itertools import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): @@ -284,4 +280,3 @@ class SavePaperNowFileImporter(IngestFileResultImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index e30bb233..38aa00eb 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,10 +1,7 @@ import sys -import json import sqlite3 import datetime -import itertools -import subprocess from bs4 import BeautifulSoup import fatcat_openapi_client diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index d439c80a..32782eac 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,7 +1,4 @@ -import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 96dbf947..5d35f5e2 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -183,7 +183,7 @@ class JstorImporter(EntityImporter): # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them release_date = None - + volume = None if article_meta.volume: volume = article_meta.volume.string or None diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 180d7ba3..d95c5847 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,12 +1,8 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS class MatchedImporter(EntityImporter): @@ -160,7 +156,6 @@ class MatchedImporter(EntityImporter): self.counts['skip-update-inflight'] += 1 return False - # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] @@ -207,4 +202,3 @@ class MatchedImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 554e052f..21feea9e 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,7 +1,5 @@ import sys -import json -import itertools import fatcat_openapi_client from .common import EntityImporter, clean @@ -89,7 +87,7 @@ class OrcidImporter(EntityImporter): if existing: self.counts['exists'] += 1 return False - + return True def insert_batch(self, batch): diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3d3e3a8c..d8a6842c 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,11 +1,9 @@ import sys import json -import sqlite3 import datetime import warnings from bs4 import BeautifulSoup -from bs4.element import NavigableString import fatcat_openapi_client from fatcat_tools.normal import * @@ -314,7 +312,7 @@ class PubmedImporter(EntityImporter): Importer for PubMed/MEDLINE XML metadata. If lookup_refs is true, will do identifer-based lookups for all references. - + TODO: MEDLINE doesn't include PMC/OA license; could include in importer? """ @@ -502,7 +500,7 @@ class PubmedImporter(EntityImporter): ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id - + ji = journal.JournalIssue volume = None if ji.find("Volume"): diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 4cd22775..c04e9aa8 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,8 +1,4 @@ -import sys -import json -import sqlite3 -import itertools import fatcat_openapi_client from fatcat_tools.normal import * @@ -192,4 +188,3 @@ class ShadowLibraryImporter(EntityImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) - |