jq fine-tuned

This commit is contained in:
Andrzej Stepien 2023-08-03 12:19:01 +02:00
parent e028973506
commit c4c139bd66
3 changed files with 3 additions and 2 deletions

1
.gitignore vendored
View File

@@ -4,3 +4,4 @@ src/data/samples
src/data/processing/ src/data/processing/
prod/ prod/
*.log *.log
src/data/unigram_freq.csv

Binary file not shown.

View File

@@ -1,10 +1,10 @@
#!/bin/bash #!/bin/bash
cd ../processing cd ../processing
jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null)}))}' \ jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null) , plural: (try .tags catch []|any(.=="plural")) }))}' \
wiktionary.json > wiktionary-p1.json wiktionary.json > wiktionary-p1.json
jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:([.[]|{type:.type, definitions:[try .definitions][]}] )}' \ jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[[([.[]|{type:.type, definitions:[try .definitions][]}] | to_entries[] | [{type: .value.type, index: .key, definitions: .value.definitions}][] )] | group_by(.type) | map({type: .[0].type, index: .[0].index, definitions: [.[].definitions|select(length > 0)]}) | sort_by(.index)[] | del(.index) ] }' \
wiktionary-p1.json > wiktionary-p2.json wiktionary-p1.json > wiktionary-p2.json
jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json