#!/usr/bin/env python
"""Pair <dt>/<dd> JSON lines into name/abbreviation records.

Reads JSON documents (one per line) from the files named on the command
line, or from stdin when none are given, and prints one JSON object per
record to stdout.  Each record has a "name" key (the text of a "dt"
element) and, when the entry that follows carries text in its first
child, an "abbrev" key.
"""

import fileinput
import json

# Example input, as produced by a pipeline like:
#
# pup 'dl json{}' < A_abrvjt.html | jq -rc .[0].children[] | grep "children"
#
# {"children":[{"tag":"b"}],"tag":"dt","text":"A + U-ARCHITECTURE AND URBANISM"}
# {"children":[{"tag":"b","text":"A U-ARCHIT URBAN"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A CRITICAL REVIEW: LASER TECHNOLOGIES FOR DEFENSE AND SECURITY"}
# {"children":[{"tag":"b","text":"P SOC PHOTO-OPT INS"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A KALEIDOSCOPIC VIEW OF NATURAL RESOURCES"}
# {"children":[{"tag":"b"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A MIDSUMMER NIGHT'S DREAM"}
# {"children":[{"tag":"b","text":"SHAKESPEARE SURV"}],"tag":"dd"}
# {"children":[{"tag":"b"}],"tag":"dt","text":"A N A E-APPROCHE NEUROPSYCHOLOGIQUE DES APPRENTISSAGES CHEZ L ENFANT"}


def parse_abbrevs(lines):
    """Yield one record dict per "dt" entry found in *lines*.

    Args:
        lines: iterable of strings, each holding one JSON object (blank
            lines are skipped).

    Yields:
        Dicts with a "name" key taken from the "dt" element's "text" and,
        if a following non-"dt" element has text in its first child, an
        "abbrev" key holding that text.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    current = {}
    for line in lines:
        line = line.strip()
        if not line:
            # A stray blank line must not abort the whole run.
            continue
        doc = json.loads(line)
        if doc.get("tag") == "dt":
            # A new "dt" closes the record collected so far.
            if current:
                yield current
            current = {"name": doc.get("text")}
            continue
        # Any other element: its first child may carry the abbreviation.
        # Tolerate a missing or empty "children" list instead of raising.
        children = doc.get("children") or [{}]
        abbrev = children[0].get("text")
        if abbrev:
            current["abbrev"] = abbrev
    # Flush the final record; no later "dt" will arrive to close it.
    if current:
        yield current


def main():
    """Parse the input files (or stdin) and print each record as JSON."""
    # Materialise first so that nothing is printed if the input is malformed.
    records = list(parse_abbrevs(fileinput.input()))
    for record in records:
        print(json.dumps(record))


if __name__ == "__main__":
    main()