Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/api_entities.py   |  2
-rw-r--r-- | fatcat_scholar/config.py         |  1
-rw-r--r-- | fatcat_scholar/djvu.py           |  4
-rwxr-xr-x | fatcat_scholar/grobid2json.py    |  4
-rw-r--r-- | fatcat_scholar/hacks.py          |  2
-rw-r--r-- | fatcat_scholar/issue_db.py       |  9
-rw-r--r-- | fatcat_scholar/kafka.py          |  6
-rw-r--r-- | fatcat_scholar/query_citation.py | 11
-rw-r--r-- | fatcat_scholar/query_fatcat.py   | 10
-rw-r--r-- | fatcat_scholar/sandcrawler.py    |  3
-rw-r--r-- | fatcat_scholar/schema.py         | 13
-rw-r--r-- | fatcat_scholar/search.py         | 19
-rw-r--r-- | fatcat_scholar/sim_pipeline.py   | 17
-rw-r--r-- | fatcat_scholar/transform.py      | 10
-rw-r--r-- | fatcat_scholar/web.py            | 31
-rw-r--r-- | fatcat_scholar/work_pipeline.py  | 25
-rw-r--r-- | fatcat_scholar/worker.py         | 18
17 files changed, 92 insertions, 93 deletions
diff --git a/fatcat_scholar/api_entities.py b/fatcat_scholar/api_entities.py
index 605a7ac..0664718 100644
--- a/fatcat_scholar/api_entities.py
+++ b/fatcat_scholar/api_entities.py
@@ -1,5 +1,5 @@
-import json
 import collections
+import json
 from typing import Any, Optional
 
 from fatcat_openapi_client import ApiClient
diff --git a/fatcat_scholar/config.py b/fatcat_scholar/config.py
index 2619df0..ddb2844 100644
--- a/fatcat_scholar/config.py
+++ b/fatcat_scholar/config.py
@@ -1,4 +1,5 @@
 import subprocess
+
 from dynaconf import Dynaconf
 
 settings = Dynaconf(settings_file="settings.toml", environments=True,)
diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py
index 3df61dd..58d6761 100644
--- a/fatcat_scholar/djvu.py
+++ b/fatcat_scholar/djvu.py
@@ -1,6 +1,6 @@
-from io import StringIO
-from typing import List, Dict, Optional
 import xml.etree.ElementTree as ET
+from io import StringIO
+from typing import Dict, List, Optional
 
 
 def djvu_extract_leaf_texts(
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 5c44953..c99b9ed 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -25,11 +25,11 @@ Prints JSON to stdout, errors to stderr
 This file copied from the sandcrawler repository.
 """
 
+import argparse
 import io
 import json
-import argparse
 import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
+from typing import Any, AnyStr, Dict, List, Optional
 
 xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py
index 29274c9..541edda 100644
--- a/fatcat_scholar/hacks.py
+++ b/fatcat_scholar/hacks.py
@@ -1,6 +1,6 @@
 import typing
-import jinja2
 
+import jinja2
 from starlette.background import BackgroundTask
 from starlette.templating import _TemplateResponse
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 3c55b51..3ca6b79 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -1,11 +1,12 @@
-import sys
+import argparse
 import json
 import sqlite3
-import argparse
+import sys
 from dataclasses import dataclass
-from typing import List, Dict, Optional, Any, Sequence, Tuple
-import fatcat_openapi_client
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
 import elasticsearch
+import fatcat_openapi_client
 from elasticsearch_dsl import Search
 
 from fatcat_scholar.config import settings
diff --git a/fatcat_scholar/kafka.py b/fatcat_scholar/kafka.py
index f412f8a..71067c1 100644
--- a/fatcat_scholar/kafka.py
+++ b/fatcat_scholar/kafka.py
@@ -1,10 +1,10 @@
-import sys
 import json
 import signal
+import sys
 from collections import Counter
-from typing import List, Any
+from typing import Any, List
 
-from confluent_kafka import Consumer, Producer, KafkaException
+from confluent_kafka import Consumer, KafkaException, Producer
 
 
 class KafkaWorker:
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
index 4f40c73..3f741f0 100644
--- a/fatcat_scholar/query_citation.py
+++ b/fatcat_scholar/query_citation.py
@@ -12,16 +12,16 @@ parallel with "regular" query?
 """
 
 import io
 import sys
-from typing import Optional, Any, Tuple
 import xml.etree.ElementTree as ET
+from typing import Any, Optional, Tuple
 
-import requests
-from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds, ReleaseContrib
-from fatcat_scholar.api_entities import entity_to_dict
-from fuzzycat.matching import match_release_fuzzy
 import fuzzycat.common
 import fuzzycat.verify
+import requests
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+from fuzzycat.matching import match_release_fuzzy
 
+from fatcat_scholar.api_entities import entity_to_dict
 from fatcat_scholar.grobid2json import biblio_info
@@ -172,6 +172,7 @@ if __name__ == "__main__":
     Demo showing how to integrate the above functions together.
     """
     import os
+
     import elasticsearch
     import fatcat_openapi_client
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index edac35d..45c7e47 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -1,14 +1,14 @@
+import argparse
+import json
 import os
 import sys
-import json
-import argparse
-from typing import List, Any
+from typing import Any, List
 
+import elasticsearch
 import requests
+from elasticsearch_dsl import Q, Search
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
-import elasticsearch
-from elasticsearch_dsl import Search, Q
 
 
 def requests_retry_session(
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 0501f8e..9f9f7e4 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -1,6 +1,7 @@
+from typing import Any, Dict, Optional
+
 import minio
 import requests
-from typing import Dict, Optional, Any
 
 
 class SandcrawlerPostgrestClient:
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 0fcf56e..4230b7e 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -4,23 +4,24 @@ get serialization for free with those. This is useful for things like
 auto-conversion of datetime objects.
 """
 
-import re
-import json
 import datetime
+import json
+import re
 from enum import Enum
-from typing import Optional, List, Any, Dict
+from typing import Any, Dict, List, Optional
 
 import ftfy
 from bs4 import BeautifulSoup
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity
 
 # pytype: disable=import-error
 from pydantic import BaseModel
+from fatcat_scholar.api_entities import entity_from_json, entity_to_dict
+from fatcat_scholar.biblio_hacks import doi_link_domain
+
 # pytype: enable=import-error
-from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-from fatcat_scholar.api_entities import entity_to_dict, entity_from_json
-from fatcat_scholar.biblio_hacks import doi_link_domain
 
 
 class DocType(str, Enum):
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index dccaf07..6522fe3 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -3,27 +3,28 @@ Helpers to make elasticsearch queries.
 """
 
 import copy
-import logging
 import datetime
+import logging
 from gettext import gettext
-from typing import List, Optional, Any
+from typing import Any, List, Optional
 
-import sentry_sdk
 import elasticsearch
-from elasticsearch_dsl import Search, Q
-from elasticsearch_dsl.response import Response
 import fatcat_openapi_client
+import sentry_sdk
+from elasticsearch_dsl import Q, Search
+from elasticsearch_dsl.response import Response
 
 # pytype: disable=import-error
 from pydantic import BaseModel
-# pytype: enable=import-error
-
 from fatcat_scholar.config import settings
 from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc
-from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
 from fatcat_scholar.query_citation import try_fuzzy_match
+from fatcat_scholar.query_parse import pre_parse_query, sniff_citation_query
+from fatcat_scholar.schema import ScholarDoc
+
+# pytype: enable=import-error
+
 
 # i18n note: the use of gettext below doesn't actually do the translation here,
 # it just ensures that the strings are caught by babel for translation later
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index aa6f4da..9157852 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,21 +1,18 @@
+import argparse
 import io
-import sys
 import sqlite3
-import argparse
-from typing import List, Dict, Optional, Any
-import urllib3.exceptions
+import sys
+from typing import Any, Dict, List, Optional
 
+import internetarchive
 import requests
 import sentry_sdk
-import internetarchive
+import urllib3.exceptions
 
-from fatcat_scholar.config import settings, GIT_REVISION
+from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
 from fatcat_scholar.issue_db import IssueDB
-from fatcat_scholar.schema import (
-    DocType,
-    IntermediateBundle,
-)
+from fatcat_scholar.schema import DocType, IntermediateBundle
 
 
 def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 3a7102a..7264540 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,18 +1,18 @@
-import sys
 import argparse
 import datetime
+import sys
 import xml.etree.ElementTree
 import xml.etree.ElementTree as ET
-from typing import List, Dict, Optional, Any, Sequence
+from typing import Any, Dict, List, Optional, Sequence
 
 import sentry_sdk
-from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
+from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
-from fatcat_scholar.schema import *
-from fatcat_scholar.config import settings, GIT_REVISION
+from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.grobid2json import teixml2json
 from fatcat_scholar.identifiers import clean_doi, clean_pmcid
+from fatcat_scholar.schema import *
 
 MAX_BODY_CHARS = 512 * 1024
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 8cf2c88..e835c01 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -6,41 +6,40 @@ So far there are few endpoints, so we just put them all here!
 
 import logging
 import urllib.parse
-from typing import Optional, Any, List, Dict
+from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel
 import babel.numbers
 import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query
-from fastapi.staticfiles import StaticFiles
+import fatcat_openapi_client
+import sentry_sdk
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Query, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import (
-    PlainTextResponse,
-    JSONResponse,
     FileResponse,
+    JSONResponse,
+    PlainTextResponse,
     RedirectResponse,
 )
-from fastapi.middleware.cors import CORSMiddleware
-import fatcat_openapi_client
-import sentry_sdk
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
 from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
-from starlette_prometheus import metrics, PrometheusMiddleware
 from starlette.exceptions import HTTPException as StarletteHTTPException
+from starlette_prometheus import PrometheusMiddleware, metrics
 
-from fatcat_scholar.config import settings, GIT_REVISION
+from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.hacks import (
     Jinja2Templates,
-    parse_accept_lang,
     make_access_redirect_url,
+    parse_accept_lang,
 )
+from fatcat_scholar.schema import ScholarDoc
 from fatcat_scholar.search import (
-    process_query,
-    FulltextQuery,
     FulltextHits,
+    FulltextQuery,
     es_scholar_index_alive,
     get_es_scholar_doc,
+    process_query,
 )
-from fatcat_scholar.schema import ScholarDoc
-
 
 logger = logging.getLogger()
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index b9c8d33..54e1ed3 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -1,29 +1,26 @@
-import os
+import argparse
 import io
+import os
 import sys
-import argparse
-from typing import List, Dict, Tuple, Optional, Any, Sequence
-import urllib3.exceptions
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 
+import internetarchive
 import minio
 import requests
 import sentry_sdk
-import internetarchive
-from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
+import urllib3.exceptions
+from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
-from fatcat_scholar.config import settings, GIT_REVISION
+from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
+from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow
 from fatcat_scholar.sandcrawler import (
-    SandcrawlerPostgrestClient,
     SandcrawlerMinioClient,
+    SandcrawlerPostgrestClient,
 )
-from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow
-from fatcat_scholar.schema import (
-    DocType,
-    IntermediateBundle,
-)
-from fatcat_scholar.sim_pipeline import truncate_pub_meta, truncate_issue_meta
+from fatcat_scholar.schema import DocType, IntermediateBundle
+from fatcat_scholar.sim_pipeline import truncate_issue_meta, truncate_pub_meta
 
 
 def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index 7d2b3d6..b836d7c 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -1,26 +1,26 @@
-import os
-import sys
 import argparse
 import datetime
-from typing import List, Any
+import os
+import sys
+from typing import Any, List
 
-import requests
-import sentry_sdk
 import elasticsearch
 import elasticsearch.helpers
 import fatcat_openapi_client
+import requests
+import sentry_sdk
 
-from fatcat_scholar.config import settings, GIT_REVISION
+from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.issue_db import IssueDB
+from fatcat_scholar.kafka import KafkaWorker
 from fatcat_scholar.sandcrawler import (
-    SandcrawlerPostgrestClient,
     SandcrawlerMinioClient,
+    SandcrawlerPostgrestClient,
 )
 from fatcat_scholar.schema import IntermediateBundle
+from fatcat_scholar.sim_pipeline import SimPipeline
 from fatcat_scholar.transform import transform_heavy
 from fatcat_scholar.work_pipeline import WorkPipeline
-from fatcat_scholar.sim_pipeline import SimPipeline
-from fatcat_scholar.kafka import KafkaWorker
 
 
 class FetchDocsWorker(KafkaWorker):