summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/common.py17
-rw-r--r--python/fatcat_tools/importers/pubmed.py2
2 files changed, 15 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 690c7902..dee60947 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -23,6 +23,7 @@ SANE_MAX_URLS = 100
# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
# 2/T and 2/B?
# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
LANG_MAP_MARC = {
'afr': 'af',
'alb': 'sq',
@@ -64,7 +65,6 @@ LANG_MAP_MARC = {
'mal': 'ml',
'mao': 'mi',
'may': 'ms',
-# mul, Multiple languages
'nor': 'no',
'per': 'fa',
'per': 'fa',
@@ -82,7 +82,6 @@ LANG_MAP_MARC = {
'tha': 'th',
'tur': 'tr',
'ukr': 'uk',
-# und, Undetermined
'urd': 'ur',
'vie': 'vi',
'wel': 'cy',
@@ -90,7 +89,19 @@ LANG_MAP_MARC = {
# additions
'gle': 'ga', # "Irish" (Gaelic)
'jav': 'jv', # Javanese
- # 'map', for Austronesian languages, has no ISO 639-1 code
+ 'welsh': 'cy', # Welsh
+ 'oci': 'oc', # Occitan
+
+# No exact ISO 639-1 equivalent: mapped to None, or to the closest modern language
+ 'grc': 'el', # Ancient Greek; map to modern greek
+ 'map': None, # Austronesian (collection)
+ 'syr': None, # Syriac, Modern
+ 'gem': None, # Germanic (Other), a collective code; seen for e.g. Old Saxon
+ 'non': None, # Old Norse
+ 'emg': None, # Eastern Meohang
+ 'neg': None, # Negidal
+ 'mul': None, # Multiple languages
+ 'und': None, # Undetermined
}
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index d31e2f60..1246bf6b 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -448,7 +448,7 @@ class PubmedImporter(EntityImporter):
language = None
else:
language = LANG_MAP_MARC.get(language)
- if not language:
+ if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
### Journal/Issue Metadata