From 7281e50aad337adbb49c7bffe646a6bae439876e Mon Sep 17 00:00:00 2001
From: Andrzej Stepien
Date: Wed, 19 Jul 2023 15:10:39 +0200
Subject: [PATCH] definitions now concatenated, ordered correctly,

---
Reviewer note (free text here is ignored by git am): the line breaks in this
patch were restored from a whitespace-collapsed copy. The layout below is
consistent with the hunk header (-1,20 +1,19) and the diffstat (10 insertions,
11 deletions), but two details are assumptions to confirm against blobs
91a7faa / ea59d81 before applying: the exact positions of the blank context
lines, and the leading indentation of the four sample-extraction jq lines
(reproduced here as a single space).

 src/data/scripts/pipeline.sh | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/data/scripts/pipeline.sh b/src/data/scripts/pipeline.sh
index 91a7faa..ea59d81 100755
--- a/src/data/scripts/pipeline.sh
+++ b/src/data/scripts/pipeline.sh
@@ -1,20 +1,19 @@
 #!/bin/bash
 
 cd ../processing
 
-jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: .senses|map(.glosses)}' \
-wiktionary.json > wiktionary-processed.json
+#jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map(.glosses|join(" ")))}' \
+#wiktionary.json > wiktionary-p1.json
 
-jq --slurp '.' wiktionary-processed.json > wiktionary-processed-array.json
+#jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[.[]|{type:.type, definitions:[try .definitions[]]| select(.!=[]) |map({(.):1})|add|keys_unsorted}]}' \
+#wiktionary-p1.json > wiktionary-p2.json
 
-jq '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[.[]|{type:.type, definitions:[try .definitions[][]]| select(.!=[]) |map({(.):1})|add|keys_unsorted}]}' \
-wiktionary-processed-array.json > wiktionary-grouped-objects.json
-
-jq --slurp '.' wiktionary-grouped-objects.json > wiktionary-grouped-objects-array.json
+#jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json
 
 #extract samples
 timestamp=$(date +%s)
- jq '. | select(.word=="chocolate")' wiktionary-grouped-objects.json > ../samples/chocolate-$timestamp.json
- jq '. | select(.word=="write")' wiktionary-grouped-objects.json > ../samples/write-$timestamp.json
- jq '. | select(.word=="terrible")' wiktionary-grouped-objects.json > ../samples/terrible-$timestamp.json
- jq '. | select(.word=="look")' wiktionary-grouped-objects.json > ../samples/look-$timestamp.json
+jsonl=wiktionary-p2.json
+ jq '. | select(.word=="chocolate")' $jsonl > ../samples/chocolate-$timestamp.json
+ jq '. | select(.word=="write")' $jsonl > ../samples/write-$timestamp.json
+ jq '. | select(.word=="terrible")' $jsonl > ../samples/terrible-$timestamp.json
+ jq '. | select(.word=="look")' $jsonl > ../samples/look-$timestamp.json