aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-23 15:36:14 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-23 15:36:14 -0700
commitee74b524b55ec6a8cb8120d890a07071174638d7 (patch)
tree8cb6629a2fa0b3fef3129d133932dc7b1283003b
parent8c23bec37e410defa219650e13bb5b2aa3b3c974 (diff)
downloadchocula-ee74b524b55ec6a8cb8120d890a07071174638d7.tar.gz
chocula-ee74b524b55ec6a8cb8120d890a07071174638d7.zip
new sources: issn_meta, zdb_fize
-rw-r--r--notes/issn_meta.md72
-rw-r--r--sources.toml11
2 files changed, 83 insertions, 0 deletions
diff --git a/notes/issn_meta.md b/notes/issn_meta.md
new file mode 100644
index 0000000..2aa045b
--- /dev/null
+++ b/notes/issn_meta.md
@@ -0,0 +1,72 @@
+
+Content comes from Martin's scraping tool.
+
+plan:
+- filter down to in-scope ISSNs
+ => journal (or synonym/translation) in title?
+ => join with existing ISSN-Ls
+- key metadata
+ => country
+ => language
+ => URL
+ => correct title
+ => print/electronic ISSNs
+
+Total size:
+
+ xzcat data.ndjson.xz | wc -l
+ 2,141,737
+
+ xzcat data.ndjson.xz | rg -i journal | wc -l
+ 136,696
+
+ xzcat data.ndjson.xz | rg -i '(Online)' | wc -l
+ 285,292
+
+Other in-scope keywords: "IEEE", "Proceedings"
+
+Blocklist: "Annual report", "Directory of ...", "Business directory ..."
+
+JSON linked data format:
+
+ @graph
+
+ mainTitle: sometimes an array (with different character sets)
+
+Transform to TSV with ISSN-L as first column, JSON of "@graph" as second column:
+
+ # was going to paste but decided not to do it this way
+ #xzcat data.ndjson.xz | jq '."@graph"[] | select(."@type" == "http://id.loc.gov/ontologies/bibframe/IssnL") | .value' -r > issnl_col.txt
+ #xzcat data.ndjson.xz | jq '."@graph"' > graph_col.txt
+
+ xzcat data.ndjson.xz | rg -v "org.elasticsearch.client.transport.NoNodeAvailableException" | python3 issnl_prefix.py | pv -l | sort > issn_meta_issnl_prefix.tsv
+ => 2.14M
+ => NOTE: run terminated with json.decoder.JSONDecodeError: Expecting ':' delimiter
+
+ cat data/container_export.json | jq .issnl -r | rg -v ^null | sort -u > fatcat_issnl.txt
+
+ # was: cat issn_meta_issnl_prefix.tsv | rg -i "journal " > issn_meta.journal.tsv
+ cat issn_meta_issnl_prefix.tsv | rg -i "journal " | rg '"url":' | rg -iv "magazine" > issn_meta.journal.tsv
+ join -t $'\t' fatcat_issnl.txt issn_meta_issnl_prefix.tsv > issn_meta.fatcat.tsv
+ cat issn_meta.journal.tsv issn_meta.fatcat.tsv | cut -f2 | sort -u > issn_meta.filtered.json
+
+ wc -l fatcat_issnl.txt issn_meta.*.json
+ 147973 fatcat_issnl.txt
+ 216673 issn_meta.fatcat.json
+ 277076 issn_meta.filtered.json
+ 136696 issn_meta.journal.json
+
+ cat issn_meta.journal.tsv issn_meta.fatcat.tsv | cut -f1 | sort -u | wc -l
+ => 197,724
+
+The original "journal" filter would yield about 50k new journals. With some
+narrower filters (no "magazine", and requiring a URL to be defined):
+
+ join -t $'\t' -v 2 fatcat_issnl.txt issn_meta_issnl_prefix.tsv | rg -i "journal " | rg '"url":' | rg -iv "magazine" > issn_meta.new.tsv
+
+ wc -l issn_meta.new.tsv
+ 12819 issn_meta.new.tsv
+
+ cut -f1 issn_meta.new.tsv | sort -u | wc -l
+ 11773
+
diff --git a/sources.toml b/sources.toml
index 31b23c4..f2753c2 100644
--- a/sources.toml
+++ b/sources.toml
@@ -132,3 +132,14 @@ filename = "container_stats.json"
date = "2020-05-05"
filename = "manual_longtail_homepages.tsv"
mirror_url = "https://archive.org/download/chocula-manual-hompages"
+
+[issn_meta]
+# see notes/issn_meta.md
+date = "2020-03-18"
+filename = "issn_meta.filtered.json"
+mirror_url = "https://archive.org/download/issn_public_data_20200318"
+
+[zdb_fize]
+date = "2020-05-30"
+filename = "zdb_fize_homepage_available.json"
+mirror_url = "https://archive.org/download/issn_homepage_candidates_20200530"