diff options
| -rw-r--r-- | fatcat_scholar/schema.py | 11 | ||||
| -rw-r--r-- | fatcat_scholar/sim_pipeline.py | 12 | ||||
| -rw-r--r-- | fatcat_scholar/transform.py | 51 | ||||
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 11 | ||||
| -rw-r--r-- | fatcat_scholar/worker.py | 11 | 
5 files changed, 74 insertions, 22 deletions
| diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 9b2d2fb..b93962c 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -380,8 +380,15 @@ def clean_str(raw: Optional[str], strip_trailing_period: bool = False) -> Option      text = ftfy.fix_text(raw) -    # remove HTML -    text = BeautifulSoup(text, "html.parser").get_text() +    # remove HTML tags +    try: +        # TODO: work_h4ufpvlh3rcefacajni7sdndwa as a regression test +        # TODO: consider w3clib "remove tags" as an alternative +        clean_text = BeautifulSoup(text, "html.parser").get_text() +        text = clean_text +    except UnboundLocalError: +        # TODO: passing through raw string; what should behavior actually be? +        pass      # TODO: for performance, compile these as globals?      # replaces whitespace with single space diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py index 621f1fc..e5e2a02 100644 --- a/fatcat_scholar/sim_pipeline.py +++ b/fatcat_scholar/sim_pipeline.py @@ -3,10 +3,13 @@ import sys  import sqlite3  import argparse  from typing import List, Dict, Optional, Any +import urllib3.exceptions  import requests +import sentry_sdk  import internetarchive +from fatcat_scholar.config import settings, GIT_REVISION  from fatcat_scholar.djvu import djvu_extract_leaf_texts  from fatcat_scholar.issue_db import IssueDB  from fatcat_scholar.schema import ( @@ -159,6 +162,7 @@ class SimPipeline:                  requests.exceptions.ConnectionError,                  requests.exceptions.Timeout,                  requests.exceptions.RetryError, +                urllib3.exceptions.MaxRetryError,              ) as e:                  print(str(e), file=sys.stderr)                  continue @@ -254,6 +258,14 @@ def main() -> None:          parser.print_help(file=sys.stderr)          sys.exit(-1) +    if settings.SENTRY_DSN: +        sentry_sdk.init( +            dsn=settings.SENTRY_DSN, +            environment=settings.SCHOLAR_ENV, +            max_breadcrumbs=10, +            release=GIT_REVISION, +        ) +      sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))      if args.func == "run_issue_db": diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 4114885..cbf0a89 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -1,14 +1,16 @@  import sys  import argparse  import datetime +import xml.etree.ElementTree  import xml.etree.ElementTree as ET  from typing import List, Dict, Optional, Any, Sequence +import sentry_sdk  from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity  from fatcat_scholar.api_entities import *  from fatcat_scholar.schema import * -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION  from fatcat_scholar.grobid2json import teixml2json @@ -138,11 +140,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:      if release_year and abs(release_year) > 2050:          release_year = None -    lang_code = SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get( -        pub_meta.get("language") -    ) -    if isinstance(lang_code, list): -        lang_code = lang_code[0] +    language = issue_meta.get("language") or pub_meta.get("language") +    if isinstance(language, list): +        language = language[0] +    lang_code = SIM_LANG_MAP.get(language)      return ScholarBiblio(          # release_ident=release.ident, @@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:              for f in fulltext_release.files              if f.ident == heavy.grobid_fulltext["file_ident"]          ][0] -        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"]) -        if not abstracts: -            abstracts = es_abstracts_from_grobid(tei_dict) -        grobid_fulltext = es_fulltext_from_grobid( -            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file -        ) -        if exclude_web_fulltext and grobid_fulltext: -            if not fulltext: -                # include only partial fulltext object, with no access -                fulltext = grobid_fulltext.remove_access() -        else: -            fulltext = grobid_fulltext +        try: +            tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"]) +        except xml.etree.ElementTree.ParseError: +            tei_dict = None +        if tei_dict: +            if not abstracts: +                abstracts = es_abstracts_from_grobid(tei_dict) +            grobid_fulltext = es_fulltext_from_grobid( +                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file +            ) +            if exclude_web_fulltext and grobid_fulltext: +                if not fulltext: +                    # include only partial fulltext object, with no access +                    fulltext = grobid_fulltext.remove_access() +            else: +                fulltext = grobid_fulltext      if not fulltext and heavy.pdftotext_fulltext:          fulltext_release = [ @@ -799,6 +804,16 @@ def main() -> None:          parser.print_help(file=sys.stderr)          sys.exit(-1) +    # enable sentry exception catching; this helps a lot with debugging bulk +    # transform runs +    if settings.SENTRY_DSN: +        sentry_sdk.init( +            dsn=settings.SENTRY_DSN, +            environment=settings.SCHOLAR_ENV, +            max_breadcrumbs=10, +            release=GIT_REVISION, +        ) +      if args.func == "run_transform":          run_transform(infile=args.json_file)      elif args.func == "run_refs": diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index aef2064..cb96274 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -7,11 +7,12 @@ import urllib3.exceptions  import minio  import requests +import sentry_sdk  import internetarchive  from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity  from fatcat_scholar.api_entities import * -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION  from fatcat_scholar.djvu import djvu_extract_leaf_texts  from fatcat_scholar.sandcrawler import (      SandcrawlerPostgrestClient, @@ -469,6 +470,14 @@ def main() -> None:          parser.print_help(file=sys.stderr)          sys.exit(-1) +    if settings.SENTRY_DSN: +        sentry_sdk.init( +            dsn=settings.SENTRY_DSN, +            environment=settings.SCHOLAR_ENV, +            max_breadcrumbs=10, +            release=GIT_REVISION, +        ) +      wp = WorkPipeline(          issue_db=IssueDB(args.issue_db_file),          sandcrawler_db_client=SandcrawlerPostgrestClient( diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py index fb806a7..854c1a2 100644 --- a/fatcat_scholar/worker.py +++ b/fatcat_scholar/worker.py @@ -6,12 +6,13 @@ import datetime  from typing import List, Any  import requests +import sentry_sdk  import elasticsearch  import elasticsearch.helpers  import fatcat_openapi_client  from fatcat_openapi_client import ReleaseEntity -from fatcat_scholar.config import settings +from fatcat_scholar.config import settings, GIT_REVISION  from fatcat_scholar.issue_db import IssueDB  from fatcat_scholar.sandcrawler import (      SandcrawlerPostgrestClient, @@ -198,6 +199,14 @@ def main() -> None:          parser.print_help(file=sys.stderr)          sys.exit(-1) +    if settings.SENTRY_DSN: +        sentry_sdk.init( +            dsn=settings.SENTRY_DSN, +            environment=settings.SCHOLAR_ENV, +            max_breadcrumbs=10, +            release=GIT_REVISION, +        ) +      if args.worker == "fetch-docs-worker":          issue_db = IssueDB(args.issue_db_file)          wp = WorkPipeline( | 
