about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/arabesque.py | 2
-rw-r--r-- python/fatcat_tools/importers/arxiv.py | 8
-rw-r--r-- python/fatcat_tools/importers/chocula.py | 1
-rw-r--r-- python/fatcat_tools/importers/common.py | 24
-rw-r--r-- python/fatcat_tools/importers/crossref.py | 5
-rw-r--r-- python/fatcat_tools/importers/datacite.py | 2
-rw-r--r-- python/fatcat_tools/importers/dblp_container.py | 3
-rw-r--r-- python/fatcat_tools/importers/dblp_release.py | 19
-rw-r--r-- python/fatcat_tools/importers/doaj_article.py | 17
-rw-r--r-- python/fatcat_tools/importers/file_meta.py | 1
-rw-r--r-- python/fatcat_tools/importers/fileset_generic.py | 1
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py | 4
-rw-r--r-- python/fatcat_tools/importers/ingest.py | 1
-rw-r--r-- python/fatcat_tools/importers/jalc.py | 10
-rw-r--r-- python/fatcat_tools/importers/journal_metadata.py | 1
-rw-r--r-- python/fatcat_tools/importers/jstor.py | 9
-rw-r--r-- python/fatcat_tools/importers/orcid.py | 3
17 files changed, 70 insertions, 41 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index c8f7c77c..2b0ff7ec 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,7 +1,7 @@
import fatcat_openapi_client
-from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex
+from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index 43325ebc..fc429fb0 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -1,16 +1,16 @@
+import datetime
+import json
import re
import sys
-import json
-import datetime
+
+import fatcat_openapi_client
from bs4 import BeautifulSoup
from pylatexenc.latex2text import LatexNodes2Text
-import fatcat_openapi_client
from .common import EntityImporter
from .crossref import lookup_license_slug
-
latex2text = LatexNodes2Text()
def latex_to_text(raw):
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 5c9efe94..0b634e73 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -1,5 +1,6 @@
import fatcat_openapi_client
+
from .common import EntityImporter, clean
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 9d22ce83..e33a2012 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -1,32 +1,32 @@
-import re
-import sys
import csv
+import datetime
import json
+import re
import sqlite3
-import datetime
import subprocess
-from collections import Counter
-from typing import Dict, Any, List, Optional, Tuple
-import lxml
+import sys
import xml.etree.ElementTree as ET
+from collections import Counter
+from typing import Any, Dict, List, Optional, Tuple
import elasticsearch
+import fatcat_openapi_client
+import fuzzycat.common
+import fuzzycat.verify
+import lxml
from bs4 import BeautifulSoup
from confluent_kafka import Consumer, KafkaException
-
-import fatcat_openapi_client
from fatcat_openapi_client import ReleaseEntity
from fatcat_openapi_client.rest import ApiException
from fuzzycat.matching import match_release_fuzzy
-import fuzzycat.common
-import fuzzycat.verify
# TODO: refactor so remove need for this (re-imports for backwards compatibility)
-from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
+from fatcat_tools.normal import LANG_MAP_MARC, b32_hex
+from fatcat_tools.normal import clean_str as clean # noqa: F401
+from fatcat_tools.normal import is_cjk
from fatcat_tools.transforms import entity_to_dict
-
DATE_FMT: str = "%Y-%m-%d"
SANE_MAX_RELEASES: int = 200
SANE_MAX_URLS: int = 100
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 38c19a63..fd6936a4 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,14 +1,13 @@
-import sqlite3
import datetime
-from typing import Dict, Optional, Any
+import sqlite3
+from typing import Any, Dict, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ReleaseEntity
from .common import EntityImporter, clean
-
# The docs/guide should be the canonical home for these mappings; update there
# first
# Can get a list of Crossref types (with counts) via API:
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 1593e6f8..a06c68a4 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -10,8 +10,8 @@ functions (parse_datacite_...), which may help testing.
import collections
import datetime
-import re
import json
+import re
import sqlite3
import sys
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
index a9f993a8..3d280fb7 100644
--- a/python/fatcat_tools/importers/dblp_container.py
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -7,8 +7,9 @@ pre-scraped in to JSON from HTML pages.
import sys # noqa: F401
import fatcat_openapi_client
-from fatcat_tools.normal import clean_str
+
from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.normal import clean_str
class DblpContainerImporter(EntityImporter):
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index fa5cb842..6d028f2f 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -21,18 +21,25 @@ brittle/unreliable TSV lookup mechanism for prefix-to-container_id (as of
December 2020).
"""
-import sys # noqa: F401
+import datetime
import json
+import sys # noqa: F401
import warnings
-import datetime
-from typing import List, Optional, Any
+from typing import Any, List, Optional
import fatcat_openapi_client
-from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
- clean_orcid, clean_hdl,
- clean_arxiv_id, clean_wikidata_qid, clean_isbn13)
from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.normal import (
+ clean_arxiv_id,
+ clean_doi,
+ clean_hdl,
+ clean_isbn13,
+ clean_orcid,
+ clean_str,
+ clean_wikidata_qid,
+ parse_month,
+)
from fatcat_tools.transforms import entity_to_dict
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 833089ae..1831c4cd 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -4,15 +4,24 @@ Importer for DOAJ article-level metadata, schema v1.
DOAJ API schema and docs: https://doaj.org/api/v1/docs
"""
-import warnings
import datetime
+import warnings
from typing import List, Optional
import fatcat_openapi_client
-from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
- clean_orcid, detect_text_lang, parse_lang_name, parse_country_name,
- clean_pmid, clean_pmcid)
+
from fatcat_tools.importers.common import EntityImporter
+from fatcat_tools.normal import (
+ clean_doi,
+ clean_orcid,
+ clean_pmcid,
+ clean_pmid,
+ clean_str,
+ detect_text_lang,
+ parse_country_name,
+ parse_lang_name,
+ parse_month,
+)
# Cutoff length for abstracts.
MAX_ABSTRACT_LENGTH = 2048
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 3d9f5923..0951ed84 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -1,5 +1,6 @@
import fatcat_openapi_client
+
from .common import EntityImporter
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index 13352fb2..43c2a49c 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -2,6 +2,7 @@
import fatcat_openapi_client
from fatcat_tools import entity_from_dict
+
from .common import EntityImporter
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index a811c856..0f666652 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -1,8 +1,10 @@
#!/usr/bin/env python3
-import json
import base64
+import json
+
import fatcat_openapi_client
+
from .common import EntityImporter, clean, make_rel_url
MAX_ABSTRACT_BYTES=4096
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 4d4efc0a..f0943c1e 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -2,6 +2,7 @@
import datetime
import fatcat_openapi_client
+
from .common import EntityImporter, make_rel_url
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 12f5450f..0a983c5e 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,12 +1,14 @@
-import sys
-import sqlite3
import datetime
-from bs4 import BeautifulSoup
+import sqlite3
+import sys
import fatcat_openapi_client
+from bs4 import BeautifulSoup
+
from fatcat_tools.normal import clean_doi
-from .common import EntityImporter, clean, is_cjk, DATE_FMT
+
+from .common import DATE_FMT, EntityImporter, clean, is_cjk
def parse_jalc_persons(raw_persons):
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 9f3b429f..25d7b3b5 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -1,5 +1,6 @@
import fatcat_openapi_client
+
from .common import EntityImporter, clean
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 5d35f5e2..d37424d6 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -1,12 +1,13 @@
-import sys
-import json
import datetime
+import json
+import sys
import warnings
-from bs4 import BeautifulSoup
import fatcat_openapi_client
-from .common import EntityImporter, clean, LANG_MAP_MARC
+from bs4 import BeautifulSoup
+
+from .common import LANG_MAP_MARC, EntityImporter, clean
from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 4412a46d..3bdd23a1 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -1,8 +1,11 @@
import sys
+
import fatcat_openapi_client
+
from .common import EntityImporter, clean
+
def value_or_none(e):
if type(e) == dict:
e = e.get('value')