diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 12:57:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 16:46:07 -0700 |
commit | caf1cb316ed18820f3239a285ef14bf45ef963a2 (patch) | |
tree | 2d3713773dac769878154f61c2eb9f7804f1a60c /python/fatcat_tools/importers/arxiv.py | |
parent | 10a2374051568edf3d872988e730328d899a0fdd (diff) | |
download | fatcat-caf1cb316ed18820f3239a285ef14bf45ef963a2.tar.gz fatcat-caf1cb316ed18820f3239a285ef14bf45ef963a2.zip |
typing: initial annotations on importers
This commit only adds the type annotations; it does not include the code
fixes needed to make type checking pass.
Diffstat (limited to 'python/fatcat_tools/importers/arxiv.py')
-rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 28 |
1 files changed, 16 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 0957db2c..1d50dd9a 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -2,9 +2,11 @@ import datetime import json import re import sys +from typing import Any, Dict, List, Optional import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text from .common import EntityImporter @@ -13,7 +15,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() -def latex_to_text(raw): +def latex_to_text(raw: str) -> str: try: return latex2text.latex_to_text(raw).strip() except AttributeError: @@ -22,7 +24,7 @@ def latex_to_text(raw): return raw.strip() -def parse_arxiv_authors(raw): +def parse_arxiv_authors(raw: str) -> List[str]: if not raw: return [] raw = raw.replace("*", "") @@ -41,7 +43,7 @@ def parse_arxiv_authors(raw): return authors -def test_parse_arxiv_authors(): +def test_parse_arxiv_authors() -> None: assert parse_arxiv_authors( "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" @@ -88,7 +90,7 @@ class ArxivRawImporter(EntityImporter): the "most recent" version; can be a simple sort? """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -107,15 +109,17 @@ class ArxivRawImporter(EntityImporter): ) self._test_override = False - def parse_record(self, record): + # TODO: record is really a beautiful soup element, but setting to 'Any' to + # make initial type annotations simple + def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]: if not record: return None metadata = record.arXivRaw if not metadata: return None - extra = dict() - extra_arxiv = dict() + extra: Dict[str, Any] = dict() + extra_arxiv: Dict[str, Any] = dict() # don't know! 
release_type = "article" @@ -134,7 +138,7 @@ class ArxivRawImporter(EntityImporter): for i, a in enumerate(authors) ] - lang = "en" # the vast majority in english + lang: Optional[str] = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): comments = metadata.comments.get_text().replace("\n", " ").strip() extra_arxiv["comments"] = comments @@ -229,7 +233,7 @@ class ArxivRawImporter(EntityImporter): ).date() # TODO: source_type? versions.append( - fatcat_openapi_client.ReleaseEntity( + ReleaseEntity( work_id=None, title=title, # original_title @@ -261,7 +265,7 @@ class ArxivRawImporter(EntityImporter): versions[-1].release_stage = "accepted" return versions - def try_update(self, versions): + def try_update(self, versions: List[ReleaseEntity]) -> bool: """ This is pretty complex! There is no batch/bezerk mode for arxiv importer. @@ -344,7 +348,7 @@ class ArxivRawImporter(EntityImporter): return False - def insert_batch(self, batch_batch): + def insert_batch(self, batch_batch: List[ReleaseEntity]) -> None: # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: @@ -360,7 +364,7 @@ class ArxivRawImporter(EntityImporter): else: raise NotImplementedError() - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. open with beautiful soup soup = BeautifulSoup(handle, "xml") |