From 9f8037134e809e48c627b4b836f88ae4de8b1ee5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 24 May 2019 14:59:03 -0700 Subject: more MARC languages, and less verbose reporting --- python/fatcat_tools/importers/common.py | 17 ++++++++++++++--- python/fatcat_tools/importers/pubmed.py | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 690c7902..dee60947 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -23,6 +23,7 @@ SANE_MAX_URLS = 100 # These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of # 2/T and 2/B? # PubMed/MEDLINE and JSTOR use these MARC codes +# https://www.loc.gov/marc/languages/language_name.html LANG_MAP_MARC = { 'afr': 'af', 'alb': 'sq', @@ -64,7 +65,6 @@ LANG_MAP_MARC = { 'mal': 'ml', 'mao': 'mi', 'may': 'ms', -# mul, Multiple languages 'nor': 'no', 'per': 'fa', 'per': 'fa', @@ -82,7 +82,6 @@ LANG_MAP_MARC = { 'tha': 'th', 'tur': 'tr', 'ukr': 'uk', -# und, Undetermined 'urd': 'ur', 'vie': 'vi', 'wel': 'cy', @@ -90,7 +89,19 @@ LANG_MAP_MARC = { # additions 'gle': 'ga', # "Irish" (Gaelic) 'jav': 'jv', # Javanese - # 'map', for Austronesian languages, has no ISO 639-1 code + 'welsh': 'cy', # Welsh + 'oci': 'oc', # Occitan + +# Don't have ISO 639-1 codes + 'grc': 'el', # Ancient Greek; map to modern greek + 'map': None, # Austronesian (collection) + 'syr': None, # Syriac, Modern + 'gem': None, # Old Saxon + 'non': None, # Old Norse + 'emg': None, # Eastern Meohang + 'neg': None, # Negidal + 'mul': None, # Multiple languages + 'und': None, # Undetermined } diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index d31e2f60..1246bf6b 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -448,7 +448,7 @@ class PubmedImporter(EntityImporter): language = None else: language = LANG_MAP_MARC.get(language) - if not language: + if not language and not (medline.Article.Language.string in LANG_MAP_MARC): warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) ### Journal/Issue Metadata -- cgit v1.2.3