diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-09 12:33:46 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-09 12:33:46 +0200 |
commit | 521f4fdfa3db52043686fdce232f402b468362ff (patch) | |
tree | 1cb7e1c4673a69f74d3dd24992457014cb1b8309 /extra/abbrev/parse_dls.py | |
parent | d2931da01a5c7d7254b5c7f4e4f8c1fa20513235 (diff) | |
download | refcat-521f4fdfa3db52043686fdce232f402b468362ff.tar.gz refcat-521f4fdfa3db52043686fdce232f402b468362ff.zip |
add abbreviation list, 117143 titles
Diffstat (limited to 'extra/abbrev/parse_dls.py')
-rw-r--r-- | extra/abbrev/parse_dls.py | 33 |
1 files changed, 33 insertions, 0 deletions
#!/usr/bin/env python

"""Convert a stream of <dt>/<dd> JSON nodes into title/abbreviation records.

Input is read via ``fileinput`` (files named on the command line, or stdin),
one JSON object per line, as produced by the pipeline shown below. A "dt"
node starts a new record and supplies its "name"; any other node is treated
as the matching "dd" node and supplies the optional "abbrev". One JSON
document per record is written to stdout.
"""

import fileinput
import json

# pup 'dl json{}' < A_abrvjt.html | jq -rc .[0].children[] | grep "children"

# {"children":[{"tag":"b"}],"tag":"dt","text":"A + U-ARCHITECTURE AND URBANISM"}
# {"children":[{"tag":"b","text":"A U-ARCHIT URBAN"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A CRITICAL REVIEW: LASER TECHNOLOGIES FOR DEFENSE AND SECURITY"}
# {"children":[{"tag":"b","text":"P SOC PHOTO-OPT INS"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A KALEIDOSCOPIC VIEW OF NATURAL RESOURCES"}
# {"children":[{"tag":"b"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A MIDSUMMER NIGHT'S DREAM"}
# {"children":[{"tag":"b","text":"SHAKESPEARE SURV"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A N A E-APPROCHE NEUROPSYCHOLOGIQUE DES APPRENTISSAGES CHEZ L ENFANT"}


def parse_entries(lines):
    """Group consecutive dt/dd JSON lines into a list of record dicts.

    Args:
        lines: iterable of strings, each holding one JSON object (see the
            samples above). Blank lines are ignored.

    Returns:
        A list of dicts in input order. Each dict has a "name" key (the
        "text" of the dt node) and, when the following node's first child
        carries a non-empty "text", an "abbrev" key.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError, IndexError: if a non-dt node has no "children" entry.
    """
    entries = []
    current = {}
    for line in lines:
        line = line.strip()
        if not line:
            # Tolerate stray blank lines instead of failing in json.loads.
            continue
        doc = json.loads(line)
        if doc.get("tag") == "dt":
            # A new title begins: emit the record collected so far, if any.
            if current:
                entries.append(current)
            current = {"name": doc.get("text")}
        else:
            abbrev = doc["children"][0].get("text")
            if abbrev:
                current["abbrev"] = abbrev
    # Flush the trailing record; without this the last title of the input
    # would never reach the output.
    if current:
        entries.append(current)
    return entries


def main():
    """Read JSON lines from the command-line files or stdin and print records."""
    for entry in parse_entries(fileinput.input()):
        print(json.dumps(entry))


if __name__ == "__main__":
    main()