# Patch summary (reviewer notes; patch tools skip text that precedes the first
# "diff --git" header):
#   * .gitignore: additionally ignore src/data/unigram_freq.csv.
#   * src/data/database: binary change. The new blob id e69de29 is the id git
#     gives to empty content, and no binary payload is included, so this
#     stanza is informational only.
#   * src/data/scripts/pipeline.sh:
#       - stage 1 adds a boolean "plural" field to every definition object,
#         computed by testing the sense's .tags for the value "plural";
#       - stage 2 groups each word's meanings by type, orders the groups by
#         the to_entries index of the group's first element, keeps only
#         non-empty definition lists, and deletes the helper index again.
# NOTE(review): the line breaks were restored from the hunk line counts
#   (-4,3 +4,4 and -1,10 +1,10). The blank context lines (a single space) and
#   the indentation of the two continuation lines are reconstructed; confirm
#   them against the repository before applying.
# NOTE(review): .tags on an object yields null instead of raising, so
#   "try .tags catch []" does not look like it can ever fall back to [];
#   ".tags // []" may be what was intended. Confirm with a sense that has no
#   tags.
diff --git a/.gitignore b/.gitignore
index b5d6694..135bd5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ src/data/samples
 src/data/processing/
 prod/
 *.log
+src/data/unigram_freq.csv
diff --git a/src/data/database b/src/data/database
index c5ef65d..e69de29 100644
Binary files a/src/data/database and b/src/data/database differ
diff --git a/src/data/scripts/pipeline.sh b/src/data/scripts/pipeline.sh
index 2ebac2f..aac7f97 100755
--- a/src/data/scripts/pipeline.sh
+++ b/src/data/scripts/pipeline.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 cd ../processing
 
-jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null)}))}' \
+jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null) , plural: (try .tags catch []|any(.=="plural")) }))}' \
     wiktionary.json > wiktionary-p1.json
 
-jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:([.[]|{type:.type, definitions:[try .definitions][]}] )}' \
+jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[[([.[]|{type:.type, definitions:[try .definitions][]}] | to_entries[] | [{type: .value.type, index: .key, definitions: .value.definitions}][] )] | group_by(.type) | map({type: .[0].type, index: .[0].index, definitions: [.[].definitions|select(length > 0)]}) | sort_by(.index)[] | del(.index) ] }' \
     wiktionary-p1.json > wiktionary-p2.json
 
 jq --slurp '.' wiktionary-p2.json > wiktionary-p3.json