#!/usr/bin/env python
"""Convert a stream of dt/dd JSON lines into one JSON object per title.

Input is read with ``fileinput`` (file names given on the command line, or
stdin when none are given), one JSON document per line. A document whose
``tag`` is ``"dt"`` starts a new entry and supplies its ``name``; any other
document is treated as the matching ``dd`` and supplies the ``abbrev`` from
the ``text`` of its first child, when that text is non-empty.

Output is one JSON object per line, e.g.
``{"name": "A MIDSUMMER NIGHT'S DREAM", "abbrev": "SHAKESPEARE SURV"}``.
Entries without an abbreviation carry only ``name``.

Provenance: added as extra/abbrev/parse_dls.py in commit
521f4fdfa3db52043686fdce232f402b468362ff ("add abbreviation list, 117143
titles", Martin Czygan, 2021-05-09).
"""

import fileinput
import json

# The input was produced with:
#
# pup 'dl json{}' < A_abrvjt.html | jq -rc .[0].children[] | grep "children"
#
# which yields lines such as:
#
# {"children":[{"tag":"b"}],"tag":"dt","text":"A + U-ARCHITECTURE AND URBANISM"}
# {"children":[{"tag":"b","text":"A U-ARCHIT URBAN"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A CRITICAL REVIEW: LASER TECHNOLOGIES FOR DEFENSE AND SECURITY"}
# {"children":[{"tag":"b","text":"P SOC PHOTO-OPT INS"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A KALEIDOSCOPIC VIEW OF NATURAL RESOURCES"}
# {"children":[{"tag":"b"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A MIDSUMMER NIGHT'S DREAM"}
# {"children":[{"tag":"b","text":"SHAKESPEARE SURV"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A N A E-APPROCHE NEUROPSYCHOLOGIQUE DES APPRENTISSAGES CHEZ L ENFANT"}


def parse_entries(lines):
    """Group dt/dd JSON lines into a list of ``{"name", "abbrev"}`` dicts.

    Args:
        lines: iterable of strings, each holding one JSON document. Blank
            or whitespace-only lines are skipped.

    Returns:
        A list of dicts in input order. Each dict has a ``"name"`` key taken
        from the ``dt`` document's ``text``; ``"abbrev"`` is present only if
        a following non-``dt`` document had non-empty text in its first
        child.

    Raises:
        ValueError: if a non-blank line is not valid JSON (raised by
            ``json.loads``).
        KeyError, IndexError: if a non-``dt`` document has no ``children``
            list or the list is empty.
    """
    entries = []
    current = {}
    for line in lines:
        line = line.strip()
        if not line:
            # A stray empty line (e.g. trailing newline from the shell
            # pipeline) carries no data and must not abort the whole run.
            continue
        doc = json.loads(line)
        if doc.get("tag") == "dt":
            # A new title begins: the entry collected so far is complete.
            if current:
                entries.append(current)
                current = {}
            current["name"] = doc.get("text")
        else:
            abbrev = doc["children"][0].get("text")
            if abbrev:
                current["abbrev"] = abbrev
    # Flush the final entry; no further "dt" will arrive to trigger it, so
    # without this the last title of the input would be lost.
    if current:
        entries.append(current)
    return entries


def main():
    """Read JSON lines via ``fileinput`` and print one JSON entry per line."""
    # All input is parsed before anything is printed, so a malformed line
    # fails the run without emitting partial output.
    for entry in parse_entries(fileinput.input()):
        print(json.dumps(entry))


if __name__ == "__main__":
    main()