summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/schema.py 11
-rw-r--r-- fatcat_scholar/sim_pipeline.py 12
-rw-r--r-- fatcat_scholar/transform.py 51
-rw-r--r-- fatcat_scholar/work_pipeline.py 11
-rw-r--r-- fatcat_scholar/worker.py 11
5 files changed, 74 insertions, 22 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 9b2d2fb..b93962c 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -380,8 +380,15 @@ def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Option
text = ftfy.fix_text(raw)
- # remove HTML
- text = BeautifulSoup(text, "html.parser").get_text()
+ # remove HTML tags
+ try:
+ # TODO: work_h4ufpvlh3rcefacajni7sdndwa as a regression test
+ # TODO: consider w3clib "remove tags" as an alternative
+ clean_text = BeautifulSoup(text, "html.parser").get_text()
+ text = clean_text
+ except UnboundLocalError:
+ # TODO: passing through raw string; what should behavior actually be?
+ pass
# TODO: for performance, compile these as globals?
# replaces whitespace with single space
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 621f1fc..e5e2a02 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -3,10 +3,13 @@ import sys
import sqlite3
import argparse
from typing import List, Dict, Optional, Any
+import urllib3.exceptions
import requests
+import sentry_sdk
import internetarchive
+from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.djvu import djvu_extract_leaf_texts
from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.schema import (
@@ -159,6 +162,7 @@ class SimPipeline:
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.RetryError,
+ urllib3.exceptions.MaxRetryError,
) as e:
print(str(e), file=sys.stderr)
continue
@@ -254,6 +258,14 @@ def main() -> None:
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ if settings.SENTRY_DSN:
+ sentry_sdk.init(
+ dsn=settings.SENTRY_DSN,
+ environment=settings.SCHOLAR_ENV,
+ max_breadcrumbs=10,
+ release=GIT_REVISION,
+ )
+
sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))
if args.func == "run_issue_db":
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 4114885..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,14 +1,16 @@
import sys
import argparse
import datetime
+import xml.etree.ElementTree
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional, Any, Sequence
+import sentry_sdk
from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
from fatcat_scholar.api_entities import *
from fatcat_scholar.schema import *
-from fatcat_scholar.config import settings
+from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.grobid2json import teixml2json
@@ -138,11 +140,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
if release_year and abs(release_year) > 2050:
release_year = None
- lang_code = SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get(
- pub_meta.get("language")
- )
- if isinstance(lang_code, list):
- lang_code = lang_code[0]
+ language = issue_meta.get("language") or pub_meta.get("language")
+ if isinstance(language, list):
+ language = language[0]
+ lang_code = SIM_LANG_MAP.get(language)
return ScholarBiblio(
# release_ident=release.ident,
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
for f in fulltext_release.files
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
- tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
- if not abstracts:
- abstracts = es_abstracts_from_grobid(tei_dict)
- grobid_fulltext = es_fulltext_from_grobid(
- tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
- )
- if exclude_web_fulltext and grobid_fulltext:
- if not fulltext:
- # include only partial fulltext object, with no access
- fulltext = grobid_fulltext.remove_access()
- else:
- fulltext = grobid_fulltext
+ try:
+ tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+ except xml.etree.ElementTree.ParseError:
+ tei_dict = None
+ if tei_dict:
+ if not abstracts:
+ abstracts = es_abstracts_from_grobid(tei_dict)
+ grobid_fulltext = es_fulltext_from_grobid(
+ tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+ )
+ if exclude_web_fulltext and grobid_fulltext:
+ if not fulltext:
+ # include only partial fulltext object, with no access
+ fulltext = grobid_fulltext.remove_access()
+ else:
+ fulltext = grobid_fulltext
if not fulltext and heavy.pdftotext_fulltext:
fulltext_release = [
@@ -799,6 +804,16 @@ def main() -> None:
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # enable sentry exception catching; this helps a lot with debugging bulk
+ # transform runs
+ if settings.SENTRY_DSN:
+ sentry_sdk.init(
+ dsn=settings.SENTRY_DSN,
+ environment=settings.SCHOLAR_ENV,
+ max_breadcrumbs=10,
+ release=GIT_REVISION,
+ )
+
if args.func == "run_transform":
run_transform(infile=args.json_file)
elif args.func == "run_refs":
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index aef2064..cb96274 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -7,11 +7,12 @@ import urllib3.exceptions
import minio
import requests
+import sentry_sdk
import internetarchive
from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
from fatcat_scholar.api_entities import *
-from fatcat_scholar.config import settings
+from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.djvu import djvu_extract_leaf_texts
from fatcat_scholar.sandcrawler import (
SandcrawlerPostgrestClient,
@@ -469,6 +470,14 @@ def main() -> None:
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ if settings.SENTRY_DSN:
+ sentry_sdk.init(
+ dsn=settings.SENTRY_DSN,
+ environment=settings.SCHOLAR_ENV,
+ max_breadcrumbs=10,
+ release=GIT_REVISION,
+ )
+
wp = WorkPipeline(
issue_db=IssueDB(args.issue_db_file),
sandcrawler_db_client=SandcrawlerPostgrestClient(
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index fb806a7..854c1a2 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -6,12 +6,13 @@ import datetime
from typing import List, Any
import requests
+import sentry_sdk
import elasticsearch
import elasticsearch.helpers
import fatcat_openapi_client
from fatcat_openapi_client import ReleaseEntity
-from fatcat_scholar.config import settings
+from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.sandcrawler import (
SandcrawlerPostgrestClient,
@@ -198,6 +199,14 @@ def main() -> None:
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ if settings.SENTRY_DSN:
+ sentry_sdk.init(
+ dsn=settings.SENTRY_DSN,
+ environment=settings.SCHOLAR_ENV,
+ max_breadcrumbs=10,
+ release=GIT_REVISION,
+ )
+
if args.worker == "fetch-docs-worker":
issue_db = IssueDB(args.issue_db_file)
wp = WorkPipeline(