diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-16 13:53:15 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
| commit | 6ff79f47c7c7ae27b28685674672e58b7dd4d271 (patch) | |
| tree | 2470f89de864207da8ccc92151cb35d5e20ba21b | |
| parent | 300665927f578151321b0d91b28f8aadffcf227d (diff) | |
| download | fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.tar.gz fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.zip | |
tweaks to new imports/tests
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 7 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 83 |
| -rw-r--r-- | python/fatcat_tools/importers/jalc.py | 2 |
| -rw-r--r-- | python/fatcat_tools/importers/jstor.py | 20 |
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 10 |
| -rw-r--r-- | python/tests/import_arxiv.py | 2 |
| -rw-r--r-- | python/tests/import_pubmed.py | 4 |
7 files changed, 97 insertions, 31 deletions
| diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index c53e47f1..0d0179cd 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -103,7 +103,8 @@ class ArxivRawImporter(EntityImporter):                  lang = 'ru'              # more languages? -        release_type = "article-journal" +        # don't know! +        release_type = "article"          if metadata.find('journal-ref') and metadata.find('journal-ref').string:              journal_ref = metadata.find('journal-ref').string.strip() @@ -166,7 +167,7 @@ class ArxivRawImporter(EntityImporter):                  title=title,                  #original_title                  version=version['version'], -                release_type="article-journal", +                release_type="article",                  release_stage='submitted',                  release_date=release_date.isoformat(),                  release_year=release_date.year, @@ -294,5 +295,5 @@ class ArxivRawImporter(EntityImporter):              #sys.exit(-1)  if __name__=='__main__': -    parser = ArxivRawImporter() +    parser = ArxivRawImporter(None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 7fca38cf..e37d57ec 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d"  SANE_MAX_RELEASES = 200  SANE_MAX_URLS = 100 +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? 
+# PubMed/MEDLINE and JSTOR use these MARC codes +LANG_MAP_MARC = { +    'afr': 'af', +    'alb': 'sq', +    'amh': 'am', +    'ara': 'ar', +    'arm': 'hy', +    'aze': 'az', +    'ben': 'bn', +    'bos': 'bs', +    'bul': 'bg', +    'cat': 'ca', +    'chi': 'zh', +    'cze': 'cs', +    'dan': 'da', +    'dut': 'nl', +    'eng': 'en', +    'epo': 'eo', +    'est': 'et', +    'fin': 'fi', +    'fre': 'fr', +    'geo': 'ka', +    'ger': 'de', +    'gla': 'gd', +    'gre': 'el', +    'heb': 'he', +    'hin': 'hi', +    'hrv': 'hr', +    'hun': 'hu', +    'ice': 'is', +    'ind': 'id', +    'ita': 'it', +    'jpn': 'ja', +    'kin': 'rw', +    'kor': 'ko', +    'lat': 'la', +    'lav': 'lv', +    'lit': 'lt', +    'mac': 'mk', +    'mal': 'ml', +    'mao': 'mi', +    'may': 'ms', +#    mul, Multiple languages +    'nor': 'no', +    'per': 'fa', +    'pol': 'pl', +    'por': 'pt', +    'pus': 'ps', +    'rum': 'ro', +    'rus': 'ru', +    'san': 'sa', +    'slo': 'sk', +    'slv': 'sl', +    'spa': 'es', +    'srp': 'sr', +    'swe': 'sv', +    'tha': 'th', +    'tur': 'tr', +    'ukr': 'uk', +#    und, Undetermined +    'urd': 'ur', +    'vie': 'vi', +    'wel': 'cy', +} + +  def clean(thing, force_xml=False):      """      This function is appropriate to be called on any random, non-markup string, @@ -58,19 +127,23 @@ def test_clean():  def is_cjk(s):      if not s:          return False -    return unicodedata.name(s[0]).startswith("CJK") +    for c in s: +        if c.isalpha(): +            return unicodedata.name(c).startswith("CJK") +    return False  def test_is_cjk():      assert is_cjk(None) == False      assert is_cjk('') == False      assert is_cjk('blah') == False      assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True +    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True      assert is_cjk('菊') == True -    assert is_cjk('ひヒ') == True -    assert is_cjk('english with ひヒ') == True -    assert is_cjk('き゚ゅ') == True +    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng 
after') == True      assert is_cjk('水道') == True -    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True +    # TODO: assert is_cjk('ひヒ') == True +    # TODO: assert is_cjk('き゚ゅ') == True +    # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True  DOMAIN_REL_MAP = {      "archive.org": "archive", diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index f1386bff..09b8bd76 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -301,5 +301,5 @@ class JalcImporter(EntityImporter):  if __name__=='__main__': -    parser = JalcXmlParser() +    parser = JalcImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 9bf4a043..fd1decf7 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,20 +1,12 @@  import sys  import json -import sqlite3  import datetime -import itertools -import subprocess +import warnings  from bs4 import BeautifulSoup  import fatcat_client -from .common import EntityImporter, clean - -# is this just ISO 3-char to ISO 2-char? 
-# XXX: more entries -JSTOR_LANG_MAP = { -    'eng': 'en', -} +from .common import EntityImporter, clean, LANG_MAP_MARC  # XXX: more entries  JSTOR_CONTRIB_MAP = { @@ -136,7 +128,9 @@ class JstorImporter(EntityImporter):          cm = article_meta.find("custom-meta")          if cm.find("meta-name").string == "lang":              language = cm.find("meta-value").string -            language = JSTOR_LANG_MAP.get(language) +            language = LANG_MAP_MARC.get(language) +            if not language: +                warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))          release_type = "article-journal"          if "[Abstract]" in title: @@ -238,7 +232,7 @@ class JstorImporter(EntityImporter):              return False          elif existing:              # but do update if only DOI was set -            existing.ext_ids.jstor = re.jstor_id +            existing.ext_ids.jstor = re.ext_ids.jstor              existing.extra['jstor'] = re.extra['jstor']              self.api.update_release(self.get_editgroup_id(), existing.ident, existing)              self.counts['update'] += 1 @@ -265,5 +259,5 @@ class JstorImporter(EntityImporter):              #sys.exit(-1)  if __name__=='__main__': -    parser = JstorImporter() +    parser = JstorImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 1feb41cd..f83922a3 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -115,9 +115,6 @@ class PubmedImporter(EntityImporter):      XXX: full author names      """ -    def __init__(self): -        pass -      def __init__(self, api, issn_map_file, **kwargs):          eg_desc = kwargs.get('editgroup_description', @@ -181,7 +178,8 @@ class PubmedImporter(EntityImporter):          pmcid = identifiers.find("ArticleId", IdType="pmc")          if pmcid: -            pmcid = pmcid.string +            # 
XXX: strip the version part? or retain? +            pmcid = pmcid.string.split('.')[0]          release_type = None          pub_types = [] @@ -471,7 +469,7 @@ class PubmedImporter(EntityImporter):                  self.counts['exists-pmid-doi-mismatch'] += 1                  return False -        if existing and existing.ext_ids.pmid and existing.refs: +        if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):              # TODO: any other reasons to do an update?              # don't update if it already has PMID              self.counts['exists'] += 1 @@ -508,5 +506,5 @@ class PubmedImporter(EntityImporter):              #sys.exit(-1)  if __name__=='__main__': -    parser = PubMedParser() +    parser = PubmedImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py index 726bafc5..8d91be10 100644 --- a/python/tests/import_arxiv.py +++ b/python/tests/import_arxiv.py @@ -52,7 +52,7 @@ def test_arxiv_xml_parse(arxiv_importer):      assert r1.title == "Martingale theory for housekeeping heat"      assert r1.subtitle == None      assert r1.original_title == None -    assert r1.release_type == "article-journal" +    assert r1.release_type == "article"      assert r1.release_stage == "submitted"      assert r2.release_stage == "published"      assert r1.license_slug == "ARXIV-NED-1.0" diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index eacc3815..05a77599 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -21,7 +21,7 @@ def test_pubmed_importer(pubmed_importer):      with open('tests/files/pubmedsample_2019.xml', 'r') as f:          pubmed_importer.bezerk_mode = True          counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run() -    assert counts['insert'] == 1 +    assert counts['insert'] == 176      assert counts['exists'] == 0      assert counts['skip'] == 0 @@ -39,7 +39,7 @@ def 
test_pubmed_importer(pubmed_importer):          pubmed_importer.reset()          counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()      assert counts['insert'] == 0 -    assert counts['exists'] == 1 +    assert counts['exists'] == 176      assert counts['skip'] == 0      assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index | 
