aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/fatcat_tools/importers/arxiv.py7
-rw-r--r--python/fatcat_tools/importers/common.py83
-rw-r--r--python/fatcat_tools/importers/jalc.py2
-rw-r--r--python/fatcat_tools/importers/jstor.py20
-rw-r--r--python/fatcat_tools/importers/pubmed.py10
-rw-r--r--python/tests/import_arxiv.py2
-rw-r--r--python/tests/import_pubmed.py4
7 files changed, 97 insertions, 31 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index c53e47f1..0d0179cd 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -103,7 +103,8 @@ class ArxivRawImporter(EntityImporter):
lang = 'ru'
# more languages?
- release_type = "article-journal"
+ # don't know!
+ release_type = "article"
if metadata.find('journal-ref') and metadata.find('journal-ref').string:
journal_ref = metadata.find('journal-ref').string.strip()
@@ -166,7 +167,7 @@ class ArxivRawImporter(EntityImporter):
title=title,
#original_title
version=version['version'],
- release_type="article-journal",
+ release_type="article",
release_stage='submitted',
release_date=release_date.isoformat(),
release_year=release_date.year,
@@ -294,5 +295,5 @@ class ArxivRawImporter(EntityImporter):
#sys.exit(-1)
if __name__=='__main__':
- parser = ArxivRawImporter()
+ parser = ArxivRawImporter(None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7fca38cf..e37d57ec 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+LANG_MAP_MARC = {
+ 'afr': 'af',
+ 'alb': 'sq',
+ 'amh': 'am',
+ 'ara': 'ar',
+ 'arm': 'hy',
+ 'aze': 'az',
+ 'ben': 'bn',
+ 'bos': 'bs',
+ 'bul': 'bg',
+ 'cat': 'ca',
+ 'chi': 'zh',
+ 'cze': 'cs',
+ 'dan': 'da',
+ 'dut': 'nl',
+ 'eng': 'en',
+ 'epo': 'eo',
+ 'est': 'et',
+ 'fin': 'fi',
+ 'fre': 'fr',
+ 'geo': 'ka',
+ 'ger': 'de',
+ 'gla': 'gd',
+ 'gre': 'el',
+ 'heb': 'he',
+ 'hin': 'hi',
+ 'hrv': 'hr',
+ 'hun': 'hu',
+ 'ice': 'is',
+ 'ind': 'id',
+ 'ita': 'it',
+ 'jpn': 'ja',
+ 'kin': 'rw',
+ 'kor': 'ko',
+ 'lat': 'la',
+ 'lav': 'lv',
+ 'lit': 'lt',
+ 'mac': 'mk',
+ 'mal': 'ml',
+ 'mao': 'mi',
+ 'may': 'ms',
+# mul, Multiple languages
+ 'nor': 'no',
+ 'per': 'fa',
+ 'per': 'fa',
+ 'pol': 'pl',
+ 'por': 'pt',
+ 'pus': 'ps',
+ 'rum': 'ro',
+ 'rus': 'ru',
+ 'san': 'sa',
+ 'slo': 'sk',
+ 'slv': 'sl',
+ 'spa': 'es',
+ 'srp': 'sr',
+ 'swe': 'sv',
+ 'tha': 'th',
+ 'tur': 'tr',
+ 'ukr': 'uk',
+# und, Undetermined
+ 'urd': 'ur',
+ 'vie': 'vi',
+ 'wel': 'cy',
+}
+
+
def clean(thing, force_xml=False):
"""
This function is appropriate to be called on any random, non-markup string,
@@ -58,19 +127,23 @@ def test_clean():
def is_cjk(s):
if not s:
return False
- return unicodedata.name(s[0]).startswith("CJK")
+ for c in s:
+ if c.isalpha():
+ return unicodedata.name(c).startswith("CJK")
+ return False
def test_is_cjk():
assert is_cjk(None) == False
assert is_cjk('') == False
assert is_cjk('blah') == False
assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
+ assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True
assert is_cjk('菊') == True
- assert is_cjk('ひヒ') == True
- assert is_cjk('english with ひヒ') == True
- assert is_cjk('き゚ゅ') == True
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
assert is_cjk('水道') == True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+ # TODO: assert is_cjk('ひヒ') == True
+ # TODO: assert is_cjk('き゚ゅ') == True
+ # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
DOMAIN_REL_MAP = {
"archive.org": "archive",
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index f1386bff..09b8bd76 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -301,5 +301,5 @@ class JalcImporter(EntityImporter):
if __name__=='__main__':
- parser = JalcXmlParser()
+ parser = JalcImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 9bf4a043..fd1decf7 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -1,20 +1,12 @@
import sys
import json
-import sqlite3
import datetime
-import itertools
-import subprocess
+import warnings
from bs4 import BeautifulSoup
import fatcat_client
-from .common import EntityImporter, clean
-
-# is this just ISO 3-char to ISO 2-char?
-# XXX: more entries
-JSTOR_LANG_MAP = {
- 'eng': 'en',
-}
+from .common import EntityImporter, clean, LANG_MAP_MARC
# XXX: more entries
JSTOR_CONTRIB_MAP = {
@@ -136,7 +128,9 @@ class JstorImporter(EntityImporter):
cm = article_meta.find("custom-meta")
if cm.find("meta-name").string == "lang":
language = cm.find("meta-value").string
- language = JSTOR_LANG_MAP.get(language)
+ language = LANG_MAP_MARC.get(language)
+ if not language:
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
release_type = "article-journal"
if "[Abstract]" in title:
@@ -238,7 +232,7 @@ class JstorImporter(EntityImporter):
return False
elif existing:
# but do update if only DOI was set
- existing.ext_ids.jstor = re.jstor_id
+ existing.ext_ids.jstor = re.ext_ids.jstor
existing.extra['jstor'] = re.extra['jstor']
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
self.counts['update'] += 1
@@ -265,5 +259,5 @@ class JstorImporter(EntityImporter):
#sys.exit(-1)
if __name__=='__main__':
- parser = JstorImporter()
+ parser = JstorImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 1feb41cd..f83922a3 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -115,9 +115,6 @@ class PubmedImporter(EntityImporter):
XXX: full author names
"""
- def __init__(self):
- pass
-
def __init__(self, api, issn_map_file, **kwargs):
eg_desc = kwargs.get('editgroup_description',
@@ -181,7 +178,8 @@ class PubmedImporter(EntityImporter):
pmcid = identifiers.find("ArticleId", IdType="pmc")
if pmcid:
- pmcid = pmcid.string
+ # XXX: strip the version part? or retain?
+ pmcid = pmcid.string.split('.')[0]
release_type = None
pub_types = []
@@ -471,7 +469,7 @@ class PubmedImporter(EntityImporter):
self.counts['exists-pmid-doi-mismatch'] += 1
return False
- if existing and existing.ext_ids.pmid and existing.refs:
+ if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
# TODO: any other reasons to do an update?
# don't update if it already has PMID
self.counts['exists'] += 1
@@ -508,5 +506,5 @@ class PubmedImporter(EntityImporter):
#sys.exit(-1)
if __name__=='__main__':
- parser = PubMedParser()
+ parser = PubmedImporter(None, None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_arxiv.py b/python/tests/import_arxiv.py
index 726bafc5..8d91be10 100644
--- a/python/tests/import_arxiv.py
+++ b/python/tests/import_arxiv.py
@@ -52,7 +52,7 @@ def test_arxiv_xml_parse(arxiv_importer):
assert r1.title == "Martingale theory for housekeeping heat"
assert r1.subtitle == None
assert r1.original_title == None
- assert r1.release_type == "article-journal"
+ assert r1.release_type == "article"
assert r1.release_stage == "submitted"
assert r2.release_stage == "published"
assert r1.license_slug == "ARXIV-NED-1.0"
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index eacc3815..05a77599 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -21,7 +21,7 @@ def test_pubmed_importer(pubmed_importer):
with open('tests/files/pubmedsample_2019.xml', 'r') as f:
pubmed_importer.bezerk_mode = True
counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
- assert counts['insert'] == 1
+ assert counts['insert'] == 176
assert counts['exists'] == 0
assert counts['skip'] == 0
@@ -39,7 +39,7 @@ def test_pubmed_importer(pubmed_importer):
pubmed_importer.reset()
counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
assert counts['insert'] == 0
- assert counts['exists'] == 1
+ assert counts['exists'] == 176
assert counts['skip'] == 0
assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index