From 4be1ec0cdb382d7b545eeb4c451cc123d9199d95 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 20:10:46 -0700 Subject: commit notes and issnl_prefix.py helper script --- extra/issnl_prefix.py | 14 +++++++ notes/awol-index.md | 38 +++++++++++++++++++ notes/longtail_crawl.txt | 21 +++++++++++ notes/missing_homepage_task.md | 84 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 extra/issnl_prefix.py create mode 100644 notes/awol-index.md create mode 100644 notes/longtail_crawl.txt create mode 100644 notes/missing_homepage_task.md diff --git a/extra/issnl_prefix.py b/extra/issnl_prefix.py new file mode 100644 index 0000000..4118921 --- /dev/null +++ b/extra/issnl_prefix.py @@ -0,0 +1,14 @@ +import sys +import json + +for line in sys.stdin: + if not line.strip(): + continue + record = json.loads(line)["@graph"] + issnl = None + for el in record: + if el.get("@type") == "http://id.loc.gov/ontologies/bibframe/IssnL": + issnl = el["value"] + break + if issnl: + print("\t".join((issnl, json.dumps(record, sort_keys=True)))) diff --git a/notes/awol-index.md b/notes/awol-index.md new file mode 100644 index 0000000..a999893 --- /dev/null +++ b/notes/awol-index.md @@ -0,0 +1,38 @@ + +Original source: + +Copyright statement: + + The production and publication of The AWOL Index contributes significant + additional value both to the content itself and to its presentation and + utility. This new intellectual property is covered by copyright (2015, New + York University). The full content of The AWOL Index, both in HTML and JSON + formats, is published under the terms of a Creative Commons + Attribution-ShareAlike 4.0 International License . + +Extracting ISSN-L, Title, URL from this corpus. + +Commands: + + unzip awol-index-json.zip + fd -I .json json/ | parallel cat {} | jq . 
-c | pv -l > awol-index-combined.json + cat awol-index-combined.json | rg '"is_part_of":null' > awol-index-top.json + cat awol-index-top.json | rg '"issn":' > awol-index-top-issn.json + + wc -l awol-index-combined.json awol-index-top.json awol-index-top-issn.json + 52006 awol-index-combined.json + 1302 awol-index-top.json + 503 awol-index-top-issn.json + + rg '"issn":' awol-index-top.json | wc -l + 503 + + cat awol-index-combined.json | jq .identifiers.issn.generic -c | rg -v ^null | sort -u | wc -l + 753 + + cat awol-index-top.json | jq .identifiers.issn.generic -c | rg -v ^null | sort -u | wc -l + 486 + + cat awol-index-top-issn.json | jq .identifiers.issn.generic -c | rg -v ^null | sort -u | wc -l + 486 + diff --git a/notes/longtail_crawl.txt b/notes/longtail_crawl.txt new file mode 100644 index 0000000..9d2fe96 --- /dev/null +++ b/notes/longtail_crawl.txt @@ -0,0 +1,21 @@ + + .mode tabs + .output longtail_homepage_urls.tsv + + SELECT homepage.url, homepage.issnl + FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0; + + +## Test Queries + + SELECT count(distinct homepage.issnl) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1; + + SELECT ... 
FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1; + + + SELECT homepage.domain, COUNT(*) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0 + GROUP BY homepage.domain ORDER BY COUNT(*) DESC LIMIT 20; + + SELECT homepage.suffix , COUNT(*) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' + GROUP BY homepage.suffix ORDER BY COUNT(*) DESC LIMIT 20; diff --git a/notes/missing_homepage_task.md b/notes/missing_homepage_task.md new file mode 100644 index 0000000..8d943d0 --- /dev/null +++ b/notes/missing_homepage_task.md @@ -0,0 +1,84 @@ + +## Goal + +For many long-tail journals, we have no known homepage. It is likely many of +these metadata records were actually never published, or are otherwise bad +metadata, but many are legitimate but simply missing metadata. + +Want to rapidly skim through thousands of such journals and record homepage URLs +if they exist. + +## Instructions + +For each row in the spreadsheet, search the web or other sources for a journal +homepage. This should be an official, active site where new papers are +published, as well as historical papers. + +The recommended workflow is to search for the ISSN-L and name in google, skim +the first page for likely hits, then click through to confirm that any hits are +actually journal sites. An easy way to do this is to check for the ISSN (or the +alternate "ISSNe" or "ISSNp") in the webpage itself; we will also check for +these identifiers in an automated manner to verify homepage matches.
If there +do not seem to be any hits, mark the row as skipped and move on. You will +notice that many journals are published on platforms or using common software +like OJS (Open Journal Systems), Wordpress, or SciElo. If you notice this, +please tag in the `platform` column. + +Generally we are not interested in URLs to sites that are just indexing or listing +metadata about a journal, which often show up in search results. If it seems +like a journal has been retired, archived, or mirrored elsewhere, with all the +papers available, you can put such a URL in `other_url`. This is relatively +rare. + +If the metadata (journal name) is egregiously poor or mangled, and you find +the corrected canonical title, you can put that in the `corrected_title` column +(optional). + +Recommend running through 25 random rows first without recording results to get +a feel for the process and ask any questions. + +Specific platforms we don't want any URLs from (not a complete list): + +- issn.org +- sherpa.ac.uk +- any other lists of journal information +- wikidata.org +- scimago + +Platforms which are ok to link to in the `other_url` column if no other hits: + +- web.archive.org + +Core columns to fill in for each row: + +- `skipped` (yes or blank) +- `homepage_url` +- `platform` (eg, OJS, scielo, hypothesis, or blank) + +Other columns that can be filled in, but aren't expecting them for most: + +- `other_url` +- `corrected_title` +- `original_title` (non-English) +- `corrected_publisher` +- `inactive` (yes/no) +- `comment` + +## Export Task List + +Dump to TSV: + + .headers on + .mode tabs + .output chocula_missing_hompages_longtail.2020-05-05.tsv + + SELECT issnl, issnp, issne, name, publisher, country, lang, release_count + FROM journal + WHERE + any_homepage=0 + AND has_dois=0 + AND is_longtail=1 + AND release_count < 10 + AND valid_issnl=1; + +NOTE: this is a partial list, as of 2020-05-05 about 4600 rows, -- cgit v1.2.3