| field | value | date |
|---|---|---|
| author | Martin Czygan <martin@archive.org> | 2020-11-24 19:29:07 +0000 |
| committer | Martin Czygan <martin@archive.org> | 2020-11-24 19:29:07 +0000 |
| commit | cfd13852d7cb58fcc3387373960adaf3680f0faf (patch) | |
| tree | 675954b8b34324fe22fc5a00f3fbb99a21a77a21 /python/fatcat_tools | |
| parent | fcfcd3224a113fa90da2045a3c7fe90127088ebe (diff) | |
| parent | 1fca5a9822944d0646d2dcba6cf54f27a0ffe5c0 (diff) | |
Merge branch 'bnewbold-doaj-metadata' into 'master'
DOAJ article metadata import
See merge request webgroup/fatcat!89
Diffstat (limited to 'python/fatcat_tools')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 158 |
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 2 |
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 2 |
| -rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 358 |
| -rw-r--r-- | python/fatcat_tools/normal.py | 332 |
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 3 |
7 files changed, 697 insertions, 159 deletions
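
The diffstat shows one new importer class (DoajArticleImporter) plus a new fatcat_tools.normal helper module. As orientation before the raw diff, here is a minimal, hypothetical sketch of how the new importer might be wired up against a DOAJ article JSON-lines dump. The `authenticated_api()` helper, `JsonLinePusher`, the host URL, token variable, and file paths are assumptions based on how the other fatcat importers are invoked; they are not part of this change.

```python
# Hypothetical wiring sketch (not part of this diff). Assumes the usual fatcat
# importer conventions: an authenticated API client plus JsonLinePusher.
import os
import sys

from fatcat_tools import authenticated_api
from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher

# placeholder endpoint and token; real runs would use the deployment's values
api = authenticated_api(
    "https://api.fatcat.wiki/v0",
    token=os.environ.get("FATCAT_API_AUTH_TOKEN"),
)

# issn_map_file is the ISSN-to-ISSN-L mapping file the other importers use
importer = DoajArticleImporter(
    api,
    issn_map_file=open("ISSN-to-ISSN-L.txt", "r"),
)

# one DOAJ article JSON object per line, e.g. a bulk dump piped to stdin
JsonLinePusher(importer, sys.stdin).run()
```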
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index c08e04c2..d2928d09 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -30,3 +30,4 @@ from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter
 from .shadow import ShadowLibraryImporter
 from .file_meta import FileMetaImporter
+from .doaj_article import DoajArticleImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
 import sys
 import csv
 import json
-import ftfy
-import base64
 import sqlite3
 import datetime
 import subprocess
-import unicodedata
 from collections import Counter
 from confluent_kafka import Consumer, KafkaException
 import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
 import fatcat_openapi_client
 from fatcat_openapi_client.rest import ApiException
 
+# TODO: refactor so remove need for this (re-imports for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
 
 DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    'afr': 'af',
-    'alb': 'sq',
-    'amh': 'am',
-    'ara': 'ar',
-    'arm': 'hy',
-    'aze': 'az',
-    'ben': 'bn',
-    'bos': 'bs',
-    'bul': 'bg',
-    'cat': 'ca',
-    'chi': 'zh',
-    'cze': 'cs',
-    'dan': 'da',
-    'dut': 'nl',
-    'eng': 'en',
-    'epo': 'eo',
-    'est': 'et',
-    'fin': 'fi',
-    'fre': 'fr',
-    'geo': 'ka',
-    'ger': 'de',
-    'gla': 'gd',
-    'gre': 'el',
-    'heb': 'he',
-    'hin': 'hi',
-    'hrv': 'hr',
-    'hun': 'hu',
-    'ice': 'is',
-    'ind': 'id',
-    'ita': 'it',
-    'jpn': 'ja',
-    'kin': 'rw',
-    'kor': 'ko',
-    'lat': 'la',
-    'lav': 'lv',
-    'lit': 'lt',
-    'mac': 'mk',
-    'mal': 'ml',
-    'mao': 'mi',
-    'may': 'ms',
-    'nor': 'no',
-    'per': 'fa',
-    'per': 'fa',
-    'pol': 'pl',
-    'por': 'pt',
-    'pus': 'ps',
-    'rum': 'ro',
-    'rus': 'ru',
-    'san': 'sa',
-    'slo': 'sk',
-    'slv': 'sl',
-    'spa': 'es',
-    'srp': 'sr',
-    'swe': 'sv',
-    'tha': 'th',
-    'tur': 'tr',
-    'ukr': 'uk',
-    'urd': 'ur',
-    'vie': 'vi',
-    'wel': 'cy',
-
-# additions
-    'gle': 'ga', # "Irish" (Gaelic)
-    'jav': 'jv', # Javanese
-    'welsh': 'cy', # Welsh
-    'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
-    'grc': 'el', # Ancient Greek; map to modern greek
-    'map': None, # Austronesian (collection)
-    'syr': None, # Syriac, Modern
-    'gem': None, # Old Saxon
-    'non': None, # Old Norse
-    'emg': None, # Eastern Meohang
-    'neg': None, # Negidal
-    'mul': None, # Multiple languages
-    'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
-    """
-    This function is appropriate to be called on any random, non-markup string,
-    such as author names, titles, etc.
-
-    It will try to clean up common unicode mangles, HTML characters, etc.
-
-    This will detect XML/HTML and "do the right thing" (aka, not remove
-    entities like '&amp;' if there are tags in the string), unless you pass the
-    'force_xml' parameter, which might be appropriate for, eg, names and
-    titles, which generally should be projected down to plain text.
-
-    Also strips extra whitespace.
- """ - if not thing: - return None - fix_entities = 'auto' - if force_xml: - fix_entities = True - fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() - if not fixed or len(fixed) <= 1: - # wasn't zero-length before, but is now; return None - return None - return fixed - -def test_clean(): - - assert clean(None) == None - assert clean('') == None - assert clean('1') == None - assert clean('123') == '123' - assert clean('a&b') == 'a&b' - assert clean('<b>a&b</b>') == '<b>a&b</b>' - assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' - -def b32_hex(s): - s = s.strip().split()[0].lower() - if s.startswith("sha1:"): - s = s[5:] - if len(s) != 32: - return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - -def is_cjk(s): - if not s: - return False - for c in s: - if c.isalpha(): - lang_prefix = unicodedata.name(c).split()[0] - return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') - return False - -def test_is_cjk(): - assert is_cjk(None) is False - assert is_cjk('') is False - assert is_cjk('blah') is False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True - assert is_cjk('菊') is True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True - assert is_cjk('水道') is True - assert is_cjk('オウ, イク') is True # kanji - assert is_cjk('ひヒ') is True - assert is_cjk('き゚ゅ') is True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True - DOMAIN_REL_MAP = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -444,6 +292,7 @@ class EntityImporter: raise NotImplementedError def is_orcid(self, orcid): + # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): @@ -464,6 +313,7 @@ class EntityImporter: return creator_id def is_doi(self, doi): + # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 def lookup_doi(self, doi): diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 71f08952..e77fa65e 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -487,8 +487,6 @@ class CrossrefImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - # doesn't exist, need to update - return True # eventually we'll want to support "updates", but for now just skip if # entity already exists diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 5cdc5577..70f8db86 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -781,8 +781,6 @@ class DataciteImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - # doesn't exist, need to update - return True # eventually we'll want to support "updates", but for now just skip if # entity already exists diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py new file mode 100644 index 00000000..03752484 --- /dev/null +++ b/python/fatcat_tools/importers/doaj_article.py @@ -0,0 +1,358 @@ +""" +Importer for DOAJ article-level metadata, schema v1. 
+
+DOAJ API schema and docs: https://doaj.org/api/v1/docs
+"""
+
+import warnings
+import datetime
+from typing import List, Optional
+
+import fatcat_openapi_client
+from fatcat_tools.normal import (clean_doi, clean_str, parse_month,
+    clean_orcid, detect_text_lang, parse_lang_name, parse_country_name,
+    clean_pmid, clean_pmcid)
+from fatcat_tools.importers.common import EntityImporter
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
+
+class DoajArticleImporter(EntityImporter):
+
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps"
+        )
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent',
+            'fatcat_tools.DoajArticleImporter')
+        # ensure default is to not do updates with this worker (override super() default)
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
+        super().__init__(api,
+                         issn_map_file=issn_map_file,
+                         editgroup_description=eg_desc,
+                         editgroup_extra=eg_extra,
+                         **kwargs)
+
+        self.this_year = datetime.datetime.now().year
+        self.read_issn_map_file(issn_map_file)
+
+    def want(self, obj):
+        return True
+
+    def parse_record(self, obj):
+        """
+        bibjson {
+            abstract (string, optional),
+            author (Array[bibjson.author], optional),
+            identifier (Array[bibjson.identifier]),
+            journal (bibjson.journal, optional),
+            keywords (Array[string], optional),
+            link (Array[bibjson.link], optional),
+            month (string, optional),
+            subject (Array[bibjson.subject], optional),
+            title (string),
+            year (string, optional)
+        }
+        bibjson.journal {
+            country (string, optional),
+            end_page (string, optional),
+            language (Array[string], optional),
+            license (Array[bibjson.journal.license], optional),
+            number (string, optional),
+            publisher (string, optional),
+            start_page (string, optional),
+            title (string, optional),
+            volume (string, optional)
+        }
+        """
+
+        if not obj or not isinstance(obj, dict) or not 'bibjson' in obj:
+            self.counts['skip-empty'] += 1
+            return None
+
+        bibjson = obj['bibjson']
+
+        title = clean_str(bibjson.get('title'), force_xml=True)
+        if not title:
+            self.counts['skip-title'] += 1
+            return False
+
+        container_name = clean_str(bibjson['journal']['title'])
+        container_id = None
+        # NOTE: 'issns' not documented in API schema
+        for issn in bibjson['journal']['issns']:
+            issnl = self.issn2issnl(issn)
+            if issnl:
+                container_id = self.lookup_issnl(self.issn2issnl(issn))
+            if container_id:
+                # don't store container_name when we have an exact match
+                container_name = None
+                break
+
+        volume = clean_str(bibjson['journal'].get('volume'))
+        # NOTE: this schema seems to use "number" as "issue number"
+        issue = clean_str(bibjson['journal'].get('number'))
+        publisher = clean_str(bibjson['journal'].get('publisher'))
+
+        try:
+            release_year = int(bibjson.get('year'))
+        except (TypeError, ValueError):
+            release_year = None
+        release_month = parse_month(clean_str(bibjson.get('month')))
+
+        # block bogus far-future years/dates
+        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+            release_month = None
+            release_year = None
+
+        license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
+        country = parse_country_name(bibjson['journal'].get('country'))
+        language = None
+        for raw in bibjson['journal'].get('language') or []:
+            language = parse_lang_name(raw)
+            if language:
+                break
+
+        # pages
+        # NOTE: error in API docs? seems like start_page not under 'journal' object
+        start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
+        end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
+        pages: Optional[str] = None
+        if start_page and end_page:
+            pages = f"{start_page}-{end_page}"
+        elif start_page:
+            pages = start_page
+
+        doaj_article_id = obj['id'].lower()
+        ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
+        abstracts = self.doaj_abstracts(bibjson)
+        contribs = self.doaj_contribs(bibjson.get('author') or [])
+
+        # DOAJ-specific extra
+        doaj_extra = dict()
+        if bibjson.get('subject'):
+            doaj_extra['subject'] = bibjson.get('subject')
+        if bibjson.get('keywords'):
+            doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
+
+        # generic extra
+        extra = dict()
+        if country:
+            extra['country'] = country
+        if not container_id and container_name:
+            extra['container_name'] = container_name
+        if release_year and release_month:
+            # TODO: schema migration
+            extra['release_month'] = release_month
+
+        if doaj_extra:
+            extra['doaj'] = doaj_extra
+        if not extra:
+            extra = None
+
+        re = fatcat_openapi_client.ReleaseEntity(
+            work_id=None,
+            container_id=container_id,
+            release_type='article-journal',
+            release_stage='published',
+            title=title,
+            release_year=release_year,
+            #release_date,
+            publisher=publisher,
+            ext_ids=ext_ids,
+            contribs=contribs,
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            language=language,
+            abstracts=abstracts,
+            extra=extra,
+            license_slug=license_slug,
+        )
+        re = self.biblio_hacks(re)
+        return re
+
+    @staticmethod
+    def biblio_hacks(re):
+        """
+        This function handles known special cases. For example,
+        publisher-specific or platform-specific workarounds.
+ """ + return re + + def try_update(self, re): + + # lookup existing release by DOAJ article id + existing = None + try: + existing = self.api.lookup_release(doaj=re.ext_ids.doaj) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + # then try other ext_id lookups + if not existing: + for extid_type in ('doi', 'pmid', 'pmcid'): + extid_val = getattr(re.ext_ids, extid_type) + if not extid_val: + continue + #print(f" lookup release type: {extid_type} val: {extid_val}") + try: + existing = self.api.lookup_release(**{extid_type: extid_val}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + if existing: + if existing.ext_ids.doaj: + warn_str = f"unexpected DOAJ ext_id match after lookup failed doaj={re.ext_ids.doaj} ident={existing.ident}" + warnings.warn(warn_str) + self.counts["skip-doaj-id-mismatch"] += 1 + return False + break + + # TODO: in the future could do fuzzy match here, eg using elasticsearch + + # create entity + if not existing: + return True + + # other logic could go here about skipping updates + if not self.do_updates or existing.ext_ids.doaj: + self.counts['exists'] += 1 + return False + + # fields to copy over for update + existing.ext_ids.doaj = existing.ext_ids.doaj or re.ext_ids.doaj + existing.release_type = existing.release_type or re.release_type + existing.release_stage = existing.release_stage or re.release_stage + existing.container_id = existing.container_id or re.container_id + existing.abstracts = existing.abstracts or re.abstracts + existing.extra['doaj'] = re.extra['doaj'] + existing.volume = existing.volume or re.volume + existing.issue = existing.issue or re.issue + existing.pages = existing.pages or re.pages + existing.language = existing.language or re.language + + try: + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + except fatcat_openapi_client.rest.ApiException as err: + # there is a code path where we try to update the same release + # twice in a row; if that happens, just skip + # NOTE: API behavior might change in the future? 
+ if "release_edit_editgroup_id_ident_id_key" in err.body: + self.counts['skip-update-conflict'] += 1 + return False + else: + raise err + + return False + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: + text = clean_str(bibjson.get('abstract')) + if not text or len(text) < 10: + return [] + if len(text) > MAX_ABSTRACT_LENGTH: + text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + + lang = detect_text_lang(text) + + abstract = fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=text, + lang=lang, + ) + + return [abstract,] + + def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: + """ + bibjson.author { + affiliation (string, optional), + name (string), + orcid_id (string, optional) + } + """ + contribs = [] + index = 0 + for author in authors: + if not author.get('name'): + continue + creator_id = None + orcid = clean_orcid(author.get('orcid_id')) + if orcid: + creator_id = self.lookup_orcid(orcid) + contribs.append(fatcat_openapi_client.ReleaseContrib( + raw_name=author.get('name'), + role='author', + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get('affiliation')), + )) + index += 1 + return contribs + + def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + """ + bibjson.identifier { + id (string), + type (string) + } + """ + + assert doaj_article_id.isalnum() and len(doaj_article_id) == 32 + + doi: Optional[str] = None + pmid: Optional[str] = None + pmcid: Optional[str] = None + for id_obj in identifiers: + if not id_obj.get('id'): + continue + if id_obj['type'].lower() == 'doi': + doi = clean_doi(id_obj['id']) + elif id_obj['type'].lower() == 'pmid': + pmid = clean_pmid(id_obj['id']) + elif id_obj['type'].lower() == 'pmcid': + pmcid = clean_pmcid(id_obj['id']) + + return fatcat_openapi_client.ReleaseExtIds( + doaj=doaj_article_id, + doi=doi, + pmid=pmid, + pmcid=pmcid, + ) + + def doaj_license_slug(self, license_list: List[dict]) -> Optional[str]: + """ + bibjson.journal.license { + open_access (boolean, optional), + title (string, optional), + type (string, optional), + url (string, optional), + version (string, optional) + } + """ + if not license_list: + return None + for license in license_list: + if not license.get('open_access'): + continue + slug = license.get('type') + if slug.startswith('CC '): + slug = slug.replace('CC ', 'cc-').lower() + return slug + return None diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 10a90dba..a3d6bccc 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -5,6 +5,13 @@ free-form input, titles, etc. """ import re +import base64 +from typing import Optional +import unicodedata + +import ftfy +import langdetect +import pycountry DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") @@ -55,7 +62,8 @@ def clean_doi(raw): # will likely want to expand DOI_REGEX to exclude non-ASCII characters, but # for now block specific characters so we can get PubMed importer running # again. 
-    if 'ä' in raw or '\u200e' in raw:
+    # known characters to skip: ä á \u200e \u2043 \u2012
+    if not raw.isascii():
         return None
     return raw
 
@@ -72,6 +80,10 @@ def test_clean_doi():
     assert clean_doi("10.4149/gpb¬_2017042") == None # "logical negation" character
     assert clean_doi("10.6002/ect.2020.häyry") == None # this example via pubmed (pmid:32519616)
     assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") == None
+    assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") == None
+    assert clean_doi("10.4025/diálogos.v17i2.36030") == None
+    assert clean_doi("10.19027/jai.10.106‒115") == None
+    assert clean_doi("10.15673/атбп2312-3125.17/2014.26332") == None
 
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
@@ -233,3 +245,321 @@ def test_clean_orcid():
     assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789"
     assert clean_orcid("01234567-3456-6780") == None
     assert clean_orcid("0x23-4567-3456-6780") == None
+
+
+def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
+    """
+    This function is appropriate to be called on any random, non-markup string,
+    such as author names, titles, etc.
+
+    It will try to clean up common unicode mangles, HTML characters, etc.
+
+    This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+    'force_xml' parameter, which might be appropriate for, eg, names and
+    titles, which generally should be projected down to plain text.
+
+    Also strips extra whitespace.
+    """
+    if not thing:
+        return None
+    fix_entities = 'auto'
+    if force_xml:
+        fix_entities = True
+    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+    if not fixed or len(fixed) <= 1:
+        # wasn't zero-length before, but is now; return None
+        return None
+    return fixed
+
+def test_clean_str():
+
+    assert clean_str(None) == None
+    assert clean_str('') == None
+    assert clean_str('1') == None
+    assert clean_str('123') == '123'
+    assert clean_str('a&amp;b') == 'a&b'
+    assert clean_str('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    assert clean_str('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+def b32_hex(s):
+    s = s.strip().split()[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        return s
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+def is_cjk(s):
+    if not s:
+        return False
+    for c in s:
+        if c.isalpha():
+            lang_prefix = unicodedata.name(c).split()[0]
+            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+    return False
+
+def test_is_cjk():
+    assert is_cjk(None) is False
+    assert is_cjk('') is False
+    assert is_cjk('blah') is False
+    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
+    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
+    assert is_cjk('菊') is True
+    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
+    assert is_cjk('水道') is True
+    assert is_cjk('オウ, イク') is True # kanji
+    assert is_cjk('ひヒ') is True
+    assert is_cjk('き゚ゅ') is True
+    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+
+MONTH_MAP = {
+    "jan": 1, "january": 1,
+    "feb": 2, "febuary": 2,
+    "mar": 3, "march": 3,
+    "apr": 4, "april": 4,
+    "may": 5, "may": 5,
+    "jun": 6, "june": 6,
+    "jul": 7, "july": 7,
+    "aug": 8, "august": 8,
+    "sep": 9, "september": 9,
+    "oct": 10, "october": 10,
+    "nov": 11, "nov": 11,
+    "dec": 12, "december": 12,
+}
+
+def parse_month(raw: Optional[str]) -> Optional[int]:
+    """
+    Parses a string into a month number (1 to 12)
+    """
+    if not raw:
+        return None
+    raw = raw.strip().lower()
+    if raw.isdigit():
+        raw_int = int(raw)
+        if raw_int >= 1 and raw_int <= 12:
+            return raw_int
+        else:
+            return None
+    if raw in MONTH_MAP:
+        return MONTH_MAP[raw]
+    return None
+
+def test_parse_month() -> None:
+
+    assert parse_month(None) == None
+    assert parse_month("") == None
+    assert parse_month("0") == None
+    assert parse_month("10") == 10
+    assert parse_month("jan") == 1
+    assert parse_month("September") == 9
+
+def detect_text_lang(raw: str) -> Optional[str]:
+    """
+    Tries to determine language of, eg, an abstract.
+
+    Returns an ISO 631 2-char language code, or None.
+    """
+    if not raw:
+        return None
+    try:
+        lang = langdetect.detect(raw)
+        lang = lang.split('-')[0]
+        assert len(lang) == 2
+        return lang
+    except (langdetect.lang_detect_exception.LangDetectException, TypeError):
+        return None
+    return None
+
+def test_detect_text_lang() -> None:
+    assert detect_text_lang("") == None
+    EN_SAMPLE = "this is a string of English text for testing"
+    assert detect_text_lang(EN_SAMPLE) == "en"
+    JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。"
+    assert detect_text_lang(JA_SAMPLE) == "ja"
+    ZH_SAMPLE = "随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。随着分布式清洁能源的普及,通信技术在协调各个分布式电源的控制中显得尤为重要。在电力信息传输的过程中,不同的网络状态下表现出不同的通信特性,严重的甚至会发生信息错乱丢包等行为,这对电网的实时控制产生严重影响。为研究信息系统对电力物理系统的实时影响,搭建了电力信息物理融合仿真平台,运用RT-LAB与OPNET两款实时仿真器,通过TCP/IP进行数据交互,对微电网电压、频率的集中式恢复与分布式恢复问题展开研究。仿真结果表明,该平台能有效地反映通信网络对电网控制的影响,提供了一种可靠的未来电力信息物理融合系统研究技术。"
+    assert detect_text_lang(ZH_SAMPLE) == "zh"
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+    """
+    Parses a language name and returns a 2-char ISO 631 language code.
+    """
+    if not raw:
+        return None
+    try:
+        lang = pycountry.languages.lookup(raw)
+        if lang.alpha_3 in ("mul", "mis"):
+            return None
+        return lang.alpha_2.lower()
+    except LookupError:
+        #print(f" unknown language: '{raw}', file=sys.stderr)
+        return None
+    except AttributeError:
+        #print(f" partial language metadata: '{lang}', file=sys.stderr)
+        return None
+    return None
+
+def test_parse_lang_name() -> None:
+
+    assert parse_lang_name(None) == None
+    assert parse_lang_name("") == None
+    assert parse_lang_name("asdf ") == None
+    assert parse_lang_name("english") == "en"
+    assert parse_lang_name("ENGLISH") == "en"
+    assert parse_lang_name("asdf blah") is None
+    assert parse_lang_name("en") == "en"
+    assert parse_lang_name("EN") == "en"
+    assert parse_lang_name("ENG") == "en"
+    assert parse_lang_name("English") == "en"
+    assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+    """
+    Parses a country name into a ISO country code (2-char).
+
+    This version copied from the chocula repository.
+ """ + if not s or s in ("Unknown"): + return None + + s = s.strip() + if s.lower() in ("usa", "new york (state)", "washington (state)"): + return "us" + if s.lower() in ("russia (federation)", "russia"): + return "ru" + if s == "Québec (Province)": + s = "Canada" + if s == "China (Republic : 1949- )": + return "tw" + if s == "Brunei": + return "bn" + if s.startswith("Congo "): + s = "Congo" + if s.lower() == "iran": + return "ir" + if s.lower() == "bermuda islands": + return "bm" + if s.lower() == "burma": + s = "myanmar" + if s.lower() in ("korea (south)", "south korea"): + return "kr" + if s.lower() in ("england", "scotland", "wales"): + return "uk" + s = s.replace(" (Republic)", "").replace(" (Federation)", "") + + try: + country = pycountry.countries.lookup(s) + except LookupError: + country = None + + if country: + return country.alpha_2.lower() + try: + sub = pycountry.subdivisions.lookup(s) + except LookupError: + sub = None + + s = s.replace(" (State)", "").replace(" (Province)", "") + if sub: + return sub.country_code.lower() + + else: + # print(f"unknown country: {s}", file=sys.stderr) + return None + + +def test_parse_country_name(): + assert parse_country_name("") is None + assert parse_country_name("asdf blah") is None + assert parse_country_name("us") == "us" + assert parse_country_name("USA") == "us" + assert parse_country_name("United States of America") == "us" + assert parse_country_name("united States") == "us" + assert parse_country_name("Massachusetts") == "us" + assert parse_country_name("Russia") == "ru" + assert parse_country_name("Japan") == "jp" + +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? +# PubMed/MEDLINE and JSTOR use these MARC codes +# https://www.loc.gov/marc/languages/language_name.html +LANG_MAP_MARC = { + 'afr': 'af', + 'alb': 'sq', + 'amh': 'am', + 'ara': 'ar', + 'arm': 'hy', + 'aze': 'az', + 'ben': 'bn', + 'bos': 'bs', + 'bul': 'bg', + 'cat': 'ca', + 'chi': 'zh', + 'cze': 'cs', + 'dan': 'da', + 'dut': 'nl', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'fin': 'fi', + 'fre': 'fr', + 'geo': 'ka', + 'ger': 'de', + 'gla': 'gd', + 'gre': 'el', + 'heb': 'he', + 'hin': 'hi', + 'hrv': 'hr', + 'hun': 'hu', + 'ice': 'is', + 'ind': 'id', + 'ita': 'it', + 'jpn': 'ja', + 'kin': 'rw', + 'kor': 'ko', + 'lat': 'la', + 'lav': 'lv', + 'lit': 'lt', + 'mac': 'mk', + 'mal': 'ml', + 'mao': 'mi', + 'may': 'ms', + 'nor': 'no', + 'per': 'fa', + 'per': 'fa', + 'pol': 'pl', + 'por': 'pt', + 'pus': 'ps', + 'rum': 'ro', + 'rus': 'ru', + 'san': 'sa', + 'slo': 'sk', + 'slv': 'sl', + 'spa': 'es', + 'srp': 'sr', + 'swe': 'sv', + 'tha': 'th', + 'tur': 'tr', + 'ukr': 'uk', + 'urd': 'ur', + 'vie': 'vi', + 'wel': 'cy', + +# additions + 'gle': 'ga', # "Irish" (Gaelic) + 'jav': 'jv', # Javanese + 'welsh': 'cy', # Welsh + 'oci': 'oc', # Occitan + +# Don't have ISO 639-1 codes + 'grc': 'el', # Ancient Greek; map to modern greek + 'map': None, # Austronesian (collection) + 'syr': None, # Syriac, Modern + 'gem': None, # Old Saxon + 'non': None, # Old Norse + 'emg': None, # Eastern Meohang + 'neg': None, # Negidal + 'mul': None, # Multiple languages + 'und': None, # Undetermined +} diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index dfb5f799..96a5b96b 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -213,6 +213,9 @@ def release_to_elasticsearch(entity, force_bool=True): t['best_pdf_url'] = best_pdf_url or 
good_pdf_url or any_pdf_url t['ia_pdf_url'] = ia_pdf_url + if release.ext_ids.doaj: + is_oa = True + if release.license_slug: # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): |
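
For reference, the behavior of the new fatcat_tools.normal helpers can be read directly off the tests added above. A few representative calls, with expected values taken from those test cases:

```python
# Representative calls to the new helpers in fatcat_tools.normal; expected
# values mirror the test cases added in this diff.
from fatcat_tools.normal import (
    clean_str, parse_month, parse_lang_name, parse_country_name, detect_text_lang,
)

assert clean_str('123') == '123'
assert clean_str('1') is None                 # one-char / empty strings normalize to None
assert parse_month("September") == 9          # month names and numeric strings both accepted
assert parse_month("0") is None
assert parse_lang_name("Portuguese") == "pt"  # language name -> 2-char ISO code
assert parse_country_name("United States of America") == "us"
assert detect_text_lang("this is a string of English text for testing") == "en"
```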