blob: 4416125404178241e4eb0e8af09b295502870086 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
from typing import Iterable, Optional
import json
from chocula.util import clean_str, clean_issn, parse_country
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl
class IssnMetaLoader(DirectoryLoader):
    """
    This is JSON-LD (-ish) scraped from portal.issn.org, filtered down to only
    journals already in the corpus, or matching a couple other criteria.

    Metadata we expect to get:

    - high quality English title
    - URLs
    - country

    TODO: non-english alternative titles
    """

    source_slug = "issn_meta"

    def open_file(self) -> Iterable:
        # One JSON record per line (consumed line-by-line by parse_record).
        return open(self.config.issn_meta.filepath, "r")

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        """
        Parse a single JSON-LD line into a DirectoryInfo.

        The record is an array of metadata elements; each element may
        contribute a country, ISSN-L, title, print/electronic ISSN, or
        homepage URLs to the accumulated DirectoryInfo.
        """
        row = json.loads(row)
        info = DirectoryInfo(directory_slug=self.source_slug)
        # format is an array of metadata elements
        for el in row:
            # country: elements pointing at the LoC countries vocabulary.
            # use .get() because some elements carry "label" without "@id".
            if "label" in el and el.get("@id", "").startswith(
                "http://id.loc.gov/vocabulary/countries"
            ):
                value = el["label"]
                # LoC vocabulary includes sub-national entries like
                # "Washington (State)"; blank those out rather than
                # mis-parsing them as a country.
                if "(State)" in value:
                    value = ""
                if value == "Russia (Federation)":
                    value = "Russia"
                # BUGFIX: previously passed the raw el["label"] here, which
                # silently discarded the normalizations above.
                info.country = parse_country(value)
            if "@type" not in el:
                continue
            if el["@type"] == "http://id.loc.gov/ontologies/bibframe/IssnL":
                info.issnl = clean_issn(el["value"])
            if "mainTitle" in el:
                # mainTitle may be a bare string or a list; take the first.
                if isinstance(el["mainTitle"], list):
                    info.name = clean_str(el["mainTitle"][0])
                else:
                    info.name = clean_str(el["mainTitle"])
            if el.get("format") == "vocabularies/medium#Print":
                info.issnp = clean_issn(el["issn"])
            elif el.get("format") == "vocabularies/medium#Electronic":
                info.issne = clean_issn(el["issn"])
            # url may likewise be a bare string or a list of strings.
            urls = el.get("url", [])
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                homepage = HomepageUrl.from_url(url)
                if homepage:
                    info.homepage_urls.append(homepage)
        return info
|