diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/importers/chocula.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 24 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 5 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_container.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 19 | ||||
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 17 | ||||
-rw-r--r-- | python/fatcat_tools/importers/file_meta.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/fileset_generic.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 9 | ||||
-rw-r--r-- | python/fatcat_tools/importers/orcid.py | 3 |
17 files changed, 70 insertions, 41 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c8f7c77c..2b0ff7ec 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,7 +1,7 @@ import fatcat_openapi_client -from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex +from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 43325ebc..fc429fb0 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,16 +1,16 @@ +import datetime +import json import re import sys -import json -import datetime + +import fatcat_openapi_client from bs4 import BeautifulSoup from pylatexenc.latex2text import LatexNodes2Text -import fatcat_openapi_client from .common import EntityImporter from .crossref import lookup_license_slug - latex2text = LatexNodes2Text() def latex_to_text(raw): diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 5c9efe94..0b634e73 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 9d22ce83..e33a2012 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,32 +1,32 @@ -import re -import sys import csv +import datetime import json +import re import sqlite3 -import datetime import subprocess -from collections import Counter -from typing import Dict, Any, List, Optional, Tuple -import lxml +import sys import xml.etree.ElementTree as ET +from collections import Counter +from typing import Any, Dict, List, Optional, Tuple import elasticsearch +import fatcat_openapi_client +import fuzzycat.common +import fuzzycat.verify +import lxml from bs4 import BeautifulSoup from confluent_kafka import Consumer, KafkaException - -import fatcat_openapi_client from fatcat_openapi_client import ReleaseEntity from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy -import fuzzycat.common -import fuzzycat.verify # TODO: refactor so remove need for this (re-imports for backwards compatibility) -from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.normal import LANG_MAP_MARC, b32_hex +from fatcat_tools.normal import clean_str as clean # noqa: F401 +from fatcat_tools.normal import is_cjk from fatcat_tools.transforms import entity_to_dict - DATE_FMT: str = "%Y-%m-%d" SANE_MAX_RELEASES: int = 200 SANE_MAX_URLS: int = 100 diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 38c19a63..fd6936a4 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,14 +1,13 @@ -import sqlite3 import datetime -from typing import Dict, Optional, Any +import sqlite3 +from typing import Any, Dict, Optional import fatcat_openapi_client from fatcat_openapi_client import ReleaseEntity from .common import EntityImporter, clean - # The docs/guide should be the canonical home for these mappings; update there # first # Can get a list of Crossref types (with counts) via API: diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 1593e6f8..a06c68a4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -10,8 +10,8 @@ functions (parse_datacite_...), which may help testing. import collections import datetime -import re import json +import re import sqlite3 import sys diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index a9f993a8..3d280fb7 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -7,8 +7,9 @@ pre-scraped in to JSON from HTML pages. import sys # noqa: F401 import fatcat_openapi_client -from fatcat_tools.normal import clean_str + from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index fa5cb842..6d028f2f 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -21,18 +21,25 @@ brittle/unreliable TSV lookup mechanism for prefix-to-container_id (as of December 2020). """ -import sys # noqa: F401 +import datetime import json +import sys # noqa: F401 import warnings -import datetime -from typing import List, Optional, Any +from typing import Any, List, Optional import fatcat_openapi_client -from fatcat_tools.normal import (clean_doi, clean_str, parse_month, - clean_orcid, clean_hdl, - clean_arxiv_id, clean_wikidata_qid, clean_isbn13) from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import ( + clean_arxiv_id, + clean_doi, + clean_hdl, + clean_isbn13, + clean_orcid, + clean_str, + clean_wikidata_qid, + parse_month, +) from fatcat_tools.transforms import entity_to_dict diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 833089ae..1831c4cd 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -4,15 +4,24 @@ Importer for DOAJ article-level metadata, schema v1. DOAJ API schema and docs: https://doaj.org/api/v1/docs """ -import warnings import datetime +import warnings from typing import List, Optional import fatcat_openapi_client -from fatcat_tools.normal import (clean_doi, clean_str, parse_month, - clean_orcid, detect_text_lang, parse_lang_name, parse_country_name, - clean_pmid, clean_pmcid) + from fatcat_tools.importers.common import EntityImporter +from fatcat_tools.normal import ( + clean_doi, + clean_orcid, + clean_pmcid, + clean_pmid, + clean_str, + detect_text_lang, + parse_country_name, + parse_lang_name, + parse_month, +) # Cutoff length for abstracts. MAX_ABSTRACT_LENGTH = 2048 diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 3d9f5923..0951ed84 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 13352fb2..43c2a49c 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -2,6 +2,7 @@ import fatcat_openapi_client from fatcat_tools import entity_from_dict + from .common import EntityImporter diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index a811c856..0f666652 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 -import json import base64 +import json + import fatcat_openapi_client + from .common import EntityImporter, clean, make_rel_url MAX_ABSTRACT_BYTES=4096 diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4d4efc0a..f0943c1e 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -2,6 +2,7 @@ import datetime import fatcat_openapi_client + from .common import EntityImporter, make_rel_url diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 12f5450f..0a983c5e 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,12 +1,14 @@ -import sys -import sqlite3 import datetime -from bs4 import BeautifulSoup +import sqlite3 +import sys import fatcat_openapi_client +from bs4 import BeautifulSoup + from fatcat_tools.normal import clean_doi -from .common import EntityImporter, clean, is_cjk, DATE_FMT + +from .common import DATE_FMT, EntityImporter, clean, is_cjk def parse_jalc_persons(raw_persons): diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 9f3b429f..25d7b3b5 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,5 +1,6 @@ import fatcat_openapi_client + from .common import EntityImporter, clean diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 5d35f5e2..d37424d6 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,12 +1,13 @@ -import sys -import json import datetime +import json +import sys import warnings -from bs4 import BeautifulSoup import fatcat_openapi_client -from .common import EntityImporter, clean, LANG_MAP_MARC +from bs4 import BeautifulSoup + +from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 4412a46d..3bdd23a1 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,8 +1,11 @@ import sys + import fatcat_openapi_client + from .common import EntityImporter, clean + def value_or_none(e): if type(e) == dict: e = e.get('value') |