# Patch for src/data/scripts/pipeline.sh
#
# Hunk 1: un-comments the three jq stages that write wiktionary-p1.json,
#         wiktionary-p2.json and wiktionary-p3.json.
# Hunk 2: drops the trailing empty line and adds sample extraction for the
#         words "looks" and "edges".
#
# NOTE(review): this patch was rebuilt from a copy whose line breaks had been
# collapsed onto one line. Line boundaries follow the hunk line counts. The
# single blank context line within the first three lines of hunk 1 is assumed
# to follow the cd command - it may instead follow the shebang. Exact
# whitespace on blank and indented lines could not be recovered. Confirm
# against the target file, or apply with whitespace tolerance.
diff --git a/src/data/scripts/pipeline.sh b/src/data/scripts/pipeline.sh
index ea59d81..655e0c2 100755
--- a/src/data/scripts/pipeline.sh
+++ b/src/data/scripts/pipeline.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 cd ../processing
 
-#jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map(.glosses|join(" ")))}' \
-#wiktionary.json > wiktionary-p1.json
+jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map(.glosses|join(" ")))}' \
+wiktionary.json > wiktionary-p1.json
 
-#jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[.[]|{type:.type, definitions:[try .definitions[]]| select(.!=[]) |map({(.):1})|add|keys_unsorted}]}' \
-#wiktionary-p1.json > wiktionary-p2.json
+jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[.[]|{type:.type, definitions:[try .definitions[]]| select(.!=[]) |map({(.):1})|add|keys_unsorted}]}' \
+wiktionary-p1.json > wiktionary-p2.json
 
-#jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json
+jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json
 
 #extract samples
 timestamp=$(date +%s)
@@ -16,4 +16,5 @@ jsonl=wiktionary-p2.json
 jq '. | select(.word=="write")' $jsonl > ../samples/write-$timestamp.json
 jq '. | select(.word=="terrible")' $jsonl > ../samples/terrible-$timestamp.json
 jq '. | select(.word=="look")' $jsonl > ../samples/look-$timestamp.json
-
+ jq '. | select(.word=="looks")' $jsonl > ../samples/looks-$timestamp.json
+ jq '. | select(.word=="edges")' $jsonl > ../samples/edges-$timestamp.json