diff --git a/src/data/database b/src/data/database index ca707b0..2dcdb44 100644 Binary files a/src/data/database and b/src/data/database differ diff --git a/src/data/scripts/db.mjs b/src/data/scripts/db.mjs index 69c28db..a88ae07 100644 --- a/src/data/scripts/db.mjs +++ b/src/data/scripts/db.mjs @@ -11,4 +11,10 @@ export const db = Knex({ return db .select("word") .from("dictionary") +} + +export const getBadWords = async (db) => { + return db + .select("word") + .from("bad_words") } \ No newline at end of file diff --git a/src/data/scripts/removeDerivatives.mjs b/src/data/scripts/flagDerivatives.mjs similarity index 86% rename from src/data/scripts/removeDerivatives.mjs rename to src/data/scripts/flagDerivatives.mjs index 9a7682e..f164daa 100644 --- a/src/data/scripts/removeDerivatives.mjs +++ b/src/data/scripts/flagDerivatives.mjs @@ -1,5 +1,4 @@ import { db, getWords } from './db.mjs' -import fs from 'fs' //const words = getWords(db) const allDefinitionsAreFormOf = (meanings) => { @@ -43,7 +42,7 @@ const sampleMeanings =[ } ] const words = await getWords(db) -const deleted = [] +const flagged = [] for (const word of words) { const res = await db('dictionary') @@ -51,11 +50,14 @@ for (const word of words) { .where('word', word.word) const meanings = JSON.parse(res[0].meanings) if(allDefinitionsAreFormOf(meanings)){ - deleted.push(word) + await db('dictionary'). + where('word', word.word) + .update('derivative',1) + flagged.push(word) } } -console.log(`${deleted.length} entries deleted`) -console.dir(deleted) +console.log(`${flagged.length} entries flagged`) +//console.dir(flagged) db.destroy() diff --git a/src/data/scripts/flagScientific.mjs b/src/data/scripts/flagScientific.mjs new file mode 100644 index 0000000..8e2683a --- /dev/null +++ b/src/data/scripts/flagScientific.mjs @@ -0,0 +1,41 @@ +import { db, getWords } from './db.mjs' + +const words = await getWords(db) +const allDefinitionsAreScientific = (meanings) => { + let scientific = 0 + let totalDefs = 0 + for (const meaning of meanings) { + for (const definition of meaning.definitions) { + totalDefs++ + if (definition.topics) { + for (const topic of definition.topics) { + if (topic === 'sciences') { + scientific++ + break + } + } + } + } + } + return scientific === totalDefs +} + +let updated = [] + +for (const word of words) { + const res = await db('dictionary') + .select('meanings') + .where('word', word.word) + const meanings = JSON.parse(res[0].meanings) + if (allDefinitionsAreScientific(meanings)) { + await db('dictionary') + .where('word', word.word) + .update('scientific', 1) + updated.push(word.word) + } +} + +console.log(`${updated.length} words with only scientific definitions found.`) + + +db.destroy() \ No newline at end of file diff --git a/src/data/scripts/pipeline.sh b/src/data/scripts/pipeline.sh index f72db86..7dff21c 100755 --- a/src/data/scripts/pipeline.sh +++ b/src/data/scripts/pipeline.sh @@ -1,7 +1,7 @@ #!/bin/bash cd ../processing -jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null) }))}' \ +jq -r '. | select((.pos=="noun") or (.pos=="verb") or (.pos=="adj") or (.pos=="adv")) | select((.word | test("[^a-z]"))|not) | {word: .word, type: .pos, pronunciation: .sounds[0].ipa, definitions: (try .senses|map({definition: .glosses|join(" "), form_of: (.form_of!=null), topics:(try .topics) }))}' \ wiktionary.json > wiktionary-p1.json jq --slurp '. | group_by(.word)[] | {word:.[0].word, pronunciation:.[0].pronunciation, meanings:[[([.[]|{type:.type, definitions:[try .definitions][]}] | to_entries[] | [{type: .value.type, index: .key, definitions: .value.definitions}][] )] | group_by(.type) | map({type: .[0].type, index: .[0].index, definitions: [.[].definitions|select(length > 0)[]]}) | sort_by(.index)[] | del(.index) ] }' \