micro365/src/data/scripts/pipeline.sh

21 lines
1.1 KiB
Bash
Raw Normal View History

2023-07-17 16:57:56 +00:00
#!/bin/bash
#
# Wiktionary processing pipeline.
#
# Reads ../processing/wiktionary.json and produces, in ../processing:
#   wiktionary-processed.json             filtered + reshaped entries (JSON stream)
#   wiktionary-processed-array.json       the same entries wrapped in one array
#   wiktionary-grouped-objects.json       one object per word (JSON stream)
#   wiktionary-grouped-objects-array.json the grouped objects wrapped in one array
# and timestamped sample extracts for a few words in ../samples.
#
# All paths are relative to the current working directory, so run this script
# from the directory that contains it (a sibling of "processing" and "samples").
#
# Requires: jq.

# Abort on the first failing command, unset variable, or failed pipeline stage,
# so a broken step never feeds a truncated file into the next one.
set -euo pipefail

cd ../processing || {
  printf 'pipeline.sh: cannot cd to ../processing (run from the scripts directory)\n' >&2
  exit 1
}

# Step 1: keep only nouns, verbs, adjectives and adverbs whose word contains no
# character outside a-z, and reduce each entry to
# {word, type, pronunciation, definitions}.
jq -r '
  select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv"))
  | select((.word | test("[^a-z]")) | not)
  | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: .senses|map(.glosses)}
' wiktionary.json > wiktionary-processed.json

# Step 2: collect the stream of entries into a single array so it can be grouped.
jq --slurp '.' wiktionary-processed.json > wiktionary-processed-array.json

# Step 3: group entries by word. Each group becomes one object carrying the
# word, the pronunciation of the group's first entry, and one "meanings" item
# per entry. Definitions are flattened and de-duplicated by using them as object
# keys (keys_unsorted keeps first-seen order); an entry whose flattened
# definition list is empty is dropped by select(.!=[]).
jq '
  group_by(.word)[]
  | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[.[]|{type:.type, definitions:[try .definitions[][]]| select(.!=[]) |map({(.):1})|add|keys_unsorted}]}
' wiktionary-processed-array.json > wiktionary-grouped-objects.json

# Step 4: array form of the grouped objects.
jq --slurp '.' wiktionary-grouped-objects.json > wiktionary-grouped-objects-array.json

# Extract samples: one timestamped file per sample word, so successive runs do
# not overwrite earlier samples.
timestamp=$(date +%s)
mkdir -p ../samples
for sample_word in chocolate write terrible look; do
  jq --arg word "$sample_word" 'select(.word==$word)' wiktionary-grouped-objects.json \
    > "../samples/${sample_word}-${timestamp}.json"
done
2023-07-17 17:33:35 +00:00