diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-21 20:03:45 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-21 20:03:45 +0200 |
commit | 1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b (patch) | |
tree | 05dad31420f1142f143209f0cfe36e2e8249f8a0 | |
parent | e82983633b58ead6ed2ce82e36a17af227d5f5ed (diff) | |
download | refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.tar.gz refcat-1d24518ddd1b61d8291af2b8ca5b1a5ac7ef705b.zip |
update script, notes
-rw-r--r-- | extra/cdx/cdx_query.py | 16 | ||||
-rw-r--r-- | python/notes/version_4.md | 20 | ||||
-rw-r--r-- | python/refcat/tasks.py | 2 |
3 files changed, 29 insertions, 9 deletions
diff --git a/extra/cdx/cdx_query.py b/extra/cdx/cdx_query.py index 9b7a1f0..b41ae53 100644 --- a/extra/cdx/cdx_query.py +++ b/extra/cdx/cdx_query.py @@ -9,31 +9,29 @@ import requests import sys Cdx = collections.namedtuple("Cdx", "surt date url mime status hash size") +CdxApi = "http://web.archive.org/cdx/search/cdx" +okStatus = ('200', '301', '302', '303') def parse_cdx_lines(blob): result = [] lines = blob.split("\n") - cutoff = ["access", "Access", "abgeruf", "aufgeruf"] for line in lines: - for c in cutoff: - if c in line: - line = line[:line.index(c)] - break fields = line.strip().split() if len(fields) == 0: continue cdx = Cdx(*fields) result.append(cdx) result = sorted(result, key=lambda cdx: cdx.date, reverse=True) - result = list(filter(lambda cdx: cdx.status in ("200", "301", "302", "303"), result)) + result = list(filter(lambda cdx: cdx.status in okStatus, result)) return result + def main(): stats = collections.Counter() for line in fileinput.input(): line = line.strip() - r = requests.get("http://web.archive.org/cdx/search/cdx?url={}".format(line)) + r = requests.get("{}?url={}".format(CdxApi, line)) if not r.ok: continue try: @@ -49,9 +47,9 @@ def main(): stats["ok"] += 1 print("OK\t{}\t{}".format(cdx_lines[0].date, line)) - return stats -if __name__ == '__main__': + +if __name__ == "__main__": stats = main() print(stats) diff --git a/python/notes/version_4.md b/python/notes/version_4.md index e504b2a..2e273f8 100644 --- a/python/notes/version_4.md +++ b/python/notes/version_4.md @@ -821,3 +821,23 @@ all duplicates, e.g. when the indices are different, but the reference is actually the same. Would need to "uniq" tool for the whole ref blob or something like that. + +---- + +## QA: duplicates + +There seem to be many self-links in the dataset: + +* sample: 25668733, duplicate rows: 1913155; about 8% (although only 145030 uniq; many repetitions) + +``` +$ LC_ALL=C awk '$1 == $2' bref_tabs.tsv # .... +56fbxcue6rdxlmxqto7vibg2xi 56fbxcue6rdxlmxqto7vibg2xi exact doi crossref +o2juqzskxzdtpbait5gxg3yf4q o2juqzskxzdtpbait5gxg3yf4q exact doi crossref +6mwdlhvbljgtdntz5qifywhsn4 6mwdlhvbljgtdntz5qifywhsn4 exact doi crossref +t7vluqxmgbe4pipf4nkfcayedq t7vluqxmgbe4pipf4nkfcayedq exact doi crossref +iofm6brptvczlnrys5vxw34x3i iofm6brptvczlnrys5vxw34x3i exact doi crossref +soa44abzivcnfnsx4ymxvbyg44 soa44abzivcnfnsx4ymxvbyg44 exact doi crossref +7fs4c3u2ofcmxie344o5e4wuxi 7fs4c3u2ofcmxie344o5e4wuxi exact doi crossref +igyewr6er5epfozhk7dyfqa5tu igyewr6er5epfozhk7dyfqa5tu exact doi crossref +``` diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index bf479f0..78ac68c 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -1237,6 +1237,8 @@ class Bref(Refcat): class BrefSortedByWorkID(Refcat): """ Sort by work id. 237m45.094s. + + Final file currently has: 915168340 docs. """ def requires(self): return Bref() |