From 6ff79f47c7c7ae27b28685674672e58b7dd4d271 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 16 May 2019 13:53:15 -0700 Subject: tweaks to new imports/tests --- python/fatcat_tools/importers/arxiv.py | 7 +-- python/fatcat_tools/importers/common.py | 83 +++++++++++++++++++++++++++++++-- python/fatcat_tools/importers/jalc.py | 2 +- python/fatcat_tools/importers/jstor.py | 20 +++----- python/fatcat_tools/importers/pubmed.py | 10 ++-- python/tests/import_arxiv.py | 2 +- python/tests/import_pubmed.py | 4 +- 7 files changed, 97 insertions(+), 31 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index c53e47f1..0d0179cd 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -103,7 +103,8 @@ class ArxivRawImporter(EntityImporter): lang = 'ru' # more languages? - release_type = "article-journal" + # don't know! + release_type = "article" if metadata.find('journal-ref') and metadata.find('journal-ref').string: journal_ref = metadata.find('journal-ref').string.strip() @@ -166,7 +167,7 @@ class ArxivRawImporter(EntityImporter): title=title, #original_title version=version['version'], - release_type="article-journal", + release_type="article", release_stage='submitted', release_date=release_date.isoformat(), release_year=release_date.year, @@ -294,5 +295,5 @@ class ArxivRawImporter(EntityImporter): #sys.exit(-1) if __name__=='__main__': - parser = ArxivRawImporter() + parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 7fca38cf..e37d57ec 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 SANE_MAX_URLS = 100 +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? 
+# PubMed/MEDLINE and JSTOR use these MARC codes +LANG_MAP_MARC = { + 'afr': 'af', + 'alb': 'sq', + 'amh': 'am', + 'ara': 'ar', + 'arm': 'hy', + 'aze': 'az', + 'ben': 'bn', + 'bos': 'bs', + 'bul': 'bg', + 'cat': 'ca', + 'chi': 'zh', + 'cze': 'cs', + 'dan': 'da', + 'dut': 'nl', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'fin': 'fi', + 'fre': 'fr', + 'geo': 'ka', + 'ger': 'de', + 'gla': 'gd', + 'gre': 'el', + 'heb': 'he', + 'hin': 'hi', + 'hrv': 'hr', + 'hun': 'hu', + 'ice': 'is', + 'ind': 'id', + 'ita': 'it', + 'jpn': 'ja', + 'kin': 'rw', + 'kor': 'ko', + 'lat': 'la', + 'lav': 'lv', + 'lit': 'lt', + 'mac': 'mk', + 'mal': 'ml', + 'mao': 'mi', + 'may': 'ms', +# mul, Multiple languages + 'nor': 'no', + 'per': 'fa', + 'per': 'fa', + 'pol': 'pl', + 'por': 'pt', + 'pus': 'ps', + 'rum': 'ro', + 'rus': 'ru', + 'san': 'sa', + 'slo': 'sk', + 'slv': 'sl', + 'spa': 'es', + 'srp': 'sr', + 'swe': 'sv', + 'tha': 'th', + 'tur': 'tr', + 'ukr': 'uk', +# und, Undetermined + 'urd': 'ur', + 'vie': 'vi', + 'wel': 'cy', +} + + def clean(thing, force_xml=False): """ This function is appropriate to be called on any random, non-markup string, @@ -58,19 +127,23 @@ def test_clean(): def is_cjk(s): if not s: return False - return unicodedata.name(s[0]).startswith("CJK") + for c in s: + if c.isalpha(): + return unicodedata.name(c).startswith("CJK") + return False def test_is_cjk(): assert is_cjk(None) == False assert is_cjk('') == False assert is_cjk('blah') == False assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True assert is_cjk('菊') == True - assert is_cjk('ひヒ') == True - assert is_cjk('english with ひヒ') == True - assert is_cjk('き゚ゅ') == True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True assert is_cjk('水道') == True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + # TODO: assert is_cjk('ひヒ') == True + # TODO: assert is_cjk('き゚ゅ') == True + # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True DOMAIN_REL_MAP = { "archive.org": "archive", diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index f1386bff..09b8bd76 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -301,5 +301,5 @@ class JalcImporter(EntityImporter): if __name__=='__main__': - parser = JalcXmlParser() + parser = JalcImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 9bf4a043..fd1decf7 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,20 +1,12 @@ import sys import json -import sqlite3 import datetime -import itertools -import subprocess +import warnings from bs4 import BeautifulSoup import fatcat_client -from .common import EntityImporter, clean - -# is this just ISO 3-char to ISO 2-char? 
-# XXX: more entries -JSTOR_LANG_MAP = { - 'eng': 'en', -} +from .common import EntityImporter, clean, LANG_MAP_MARC # XXX: more entries JSTOR_CONTRIB_MAP = { @@ -136,7 +128,9 @@ class JstorImporter(EntityImporter): cm = article_meta.find("custom-meta") if cm.find("meta-name").string == "lang": language = cm.find("meta-value").string - language = JSTOR_LANG_MAP.get(language) + language = LANG_MAP_MARC.get(language) + if not language: + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) release_type = "article-journal" if "[Abstract]" in title: @@ -238,7 +232,7 @@ class JstorImporter(EntityImporter): return False elif existing: # but do update if only DOI was set - existing.ext_ids.jstor = re.jstor_id + existing.ext_ids.jstor = re.ext_ids.jstor existing.extra['jstor'] = re.extra['jstor'] self.api.update_release(self.get_editgroup_id(), existing.ident, existing) self.counts['update'] += 1 @@ -265,5 +259,5 @@ class JstorImporter(EntityImporter): #sys.exit(-1) if __name__=='__main__': - parser = JstorImporter() + parser = JstorImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 1feb41cd..f83922a3 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -115,9 +115,6 @@ class PubmedImporter(EntityImporter): XXX: full author names """ - def __init__(self): - pass - def __init__(self, api, issn_map_file, **kwargs): eg_desc = kwargs.get('editgroup_description', @@ -181,7 +178,8 @@ class PubmedImporter(EntityImporter): pmcid = identifiers.find("ArticleId", IdType="pmc") if pmcid: - pmcid = pmcid.string + # XXX: strip the version part? or retain? + pmcid = pmcid.string.split('.')[0] release_type = None pub_types = [] @@ -471,7 +469,7 @@ class PubmedImporter(EntityImporter): self.counts['exists-pmid-doi-mismatch'] += 1 return False - if existing and existing.ext_ids.pmid and existing.refs: + if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? 
# don't update if it already has PMID self.counts['exists'] += 1 @@ -508,5 +506,5 @@ class PubmedImporter(EntityImporter): #sys.exit(-1) if __name__=='__main__': - parser = PubMedParser() + parser = PubmedImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py index 726bafc5..8d91be10 100644 --- a/python/tests/import_arxiv.py +++ b/python/tests/import_arxiv.py @@ -52,7 +52,7 @@ def test_arxiv_xml_parse(arxiv_importer): assert r1.title == "Martingale theory for housekeeping heat" assert r1.subtitle == None assert r1.original_title == None - assert r1.release_type == "article-journal" + assert r1.release_type == "article" assert r1.release_stage == "submitted" assert r2.release_stage == "published" assert r1.license_slug == "ARXIV-NED-1.0" diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index eacc3815..05a77599 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -21,7 +21,7 @@ def test_pubmed_importer(pubmed_importer): with open('tests/files/pubmedsample_2019.xml', 'r') as f: pubmed_importer.bezerk_mode = True counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run() - assert counts['insert'] == 1 + assert counts['insert'] == 176 assert counts['exists'] == 0 assert counts['skip'] == 0 @@ -39,7 +39,7 @@ def test_pubmed_importer(pubmed_importer): pubmed_importer.reset() counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run() assert counts['insert'] == 0 - assert counts['exists'] == 1 + assert counts['exists'] == 176 assert counts['skip'] == 0 assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index -- cgit v1.2.3
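The common.py changes above consolidate language handling: a shared MARC-to-ISO-639-1 table (LANG_MAP_MARC) replaces the per-importer JSTOR_LANG_MAP, and is_cjk() now skips leading non-alphabetic characters before checking the Unicode name. The sketch below is a minimal, self-contained illustration of how the importers use these pieces after this commit; lookup_marc_lang is a hypothetical helper name (JstorImporter inlines the same .get() plus warnings.warn() pattern), and the table shown is only a small subset of the full map.

    import unicodedata
    import warnings

    # Illustrative subset of the MARC -> ISO 639-1 table added in common.py.
    LANG_MAP_MARC = {
        'eng': 'en',
        'jpn': 'ja',
        'chi': 'zh',
        'ger': 'de',
    }

    def lookup_marc_lang(code):
        # Hypothetical helper mirroring importer behavior: return the two-letter
        # code when known, otherwise warn and return None so the release is
        # still created, just without a language set.
        lang = LANG_MAP_MARC.get(code)
        if code and not lang:
            warnings.warn("MISSING MARC LANG: {}".format(code))
        return lang

    def is_cjk(s):
        # Same logic as the revised common.is_cjk(): find the first alphabetic
        # character and check whether its Unicode name starts with "CJK", so
        # leading punctuation like '[' no longer causes a false negative.
        if not s:
            return False
        for c in s:
            if c.isalpha():
                return unicodedata.name(c).startswith("CJK")
        return False

    assert lookup_marc_lang('eng') == 'en'
    assert lookup_marc_lang('zxx') is None   # not in the map: warns, returns None
    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
    assert is_cjk('blah') is False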
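Two behavioral changes in the pubmed.py hunks are easy to miss: PMC IDs are now truncated to the base accession (whether the version suffix should be retained is still marked XXX in the patch), and an existing release that already carries the PMID is only updated when the incoming record would add references it lacks. A rough sketch of both, with hypothetical helper names:

    def normalize_pmcid(raw):
        # Keep only the base accession, e.g. "PMC1234567.2" -> "PMC1234567";
        # the importer applies the same split to the ArticleId string.
        return raw.split('.')[0]

    def should_skip_update(existing, incoming):
        # Mirrors the new condition in PubmedImporter's update path: leave an
        # existing release alone if it already has this PMID, unless it has no
        # references and the incoming record would supply some.
        if existing is None or not existing.ext_ids.pmid:
            return False
        return bool(existing.refs) or not incoming.refs

    assert normalize_pmcid("PMC1234567.2") == "PMC1234567"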