From ee74b524b55ec6a8cb8120d890a07071174638d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 15:36:14 -0700 Subject: new sources: issn_meta, zdb_fize --- notes/issn_meta.md | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ sources.toml | 11 +++++++++ 2 files changed, 83 insertions(+) create mode 100644 notes/issn_meta.md diff --git a/notes/issn_meta.md b/notes/issn_meta.md new file mode 100644 index 0000000..2aa045b --- /dev/null +++ b/notes/issn_meta.md @@ -0,0 +1,72 @@ + +content comes from martin's scraping tool + +plan: +- filter down to in-scope ISSNs + => journal (or synonym/translation) in title? + => join with existing ISSN-Ls +- key metadata + => country + => language + => URL + => correct title + => print/electronic ISSNs + +Total size: + + xzcat data.ndjson.xz | wc -l + 2,141,737 + + xzcat data.ndjson.xz | rg -i journal | wc -l + 136,696 + + xzcat data.ndjson.xz | rg -i '(Online)' | wc -l + 285,292 + +Other in-scope keywords: "IEEE", "Proceedings" + +Blocklist: "Annual report", "Directory of ...", "Business directory ..." 
+ +JSON linked data format: + + @graph + + mainTitle: sometimes an array (with different character sets) + +Transform to TSV with ISSN-L as first column, JSON of "@graph" as second column: + + # was going to paste but decided not to do it this way + #xzcat data.ndjson.xz | jq '."@graph"[] | select(."@type" == "http://id.loc.gov/ontologies/bibframe/IssnL") | .value' -r > issnl_col.txt + #xzcat data.ndjson.xz | jq '."@graph"' > graph_col.txt + + xzcat data.ndjson.xz | rg -v "org.elasticsearch.client.transport.NoNodeAvailableException" | python3 issnl_prefix.py | pv -l | sort > issn_meta_issnl_prefix.tsv + => 2.14M + => NOTE: terminated from json.decoder.JSONDecodeError: Expecting ':' delimiter + + cat data/container_export.json | jq .issnl -r | rg -v ^null | sort -u > fatcat_issnl.txt + + # was: cat issn_meta_issnl_prefix.tsv | rg -i "journal " > issn_meta.journal.tsv + cat issn_meta_issnl_prefix.tsv | rg -i "journal " | rg '"url":' | rg -iv "magazine" > issn_meta.journal.tsv + join -t $'\t' fatcat_issnl.txt issn_meta_issnl_prefix.tsv > issn_meta.fatcat.tsv + cat issn_meta.journal.tsv issn_meta.fatcat.tsv | cut -f2 | sort -u > issn_meta.filtered.json + + wc -l fatcat_issnl.txt issn_meta.*.json + 147973 fatcat_issnl.txt + 216673 issn_meta.fatcat.json + 277076 issn_meta.filtered.json + 136696 issn_meta.journal.json + + cat issn_meta.journal.tsv issn_meta.fatcat.tsv | cut -f1 | sort -u | wc -l + => 197,724 + +Original "journal" filter would be about 50k new journals. 
With some narrower +filters (no "magazine", require a URL defined): + + join -t $'\t' -v 2 fatcat_issnl.txt issn_meta_issnl_prefix.tsv | rg -i "journal " | rg '"url":' | rg -iv "magazine" > issn_meta.new.tsv + + wc -l issn_meta.new.tsv + 12819 issn_meta.new.tsv + + cut -f1 issn_meta.new.tsv | sort -u | wc -l + 11773 + diff --git a/sources.toml b/sources.toml index 31b23c4..f2753c2 100644 --- a/sources.toml +++ b/sources.toml @@ -132,3 +132,14 @@ filename = "container_stats.json" date = "2020-05-05" filename = "manual_longtail_homepages.tsv" mirror_url = "https://archive.org/download/chocula-manual-hompages" + +[issn_meta] +# see notes/issn_meta.md +date = "2020-03-18" +filename = "issn_meta.filtered.json" +mirror_url = "https://archive.org/download/issn_public_data_20200318" + +[zdb_fize] +date = "2020-05-30" +filename = "zdb_fize_homepage_available.json" +mirror_url = "https://archive.org/download/issn_homepage_candidates_20200530" -- cgit v1.2.3