From 5071c28f4f61b2d8608ca32faf495df513839910 Mon Sep 17 00:00:00 2001
From: Rawleenc Dev
Date: Sat, 5 Apr 2025 15:25:40 +0200
Subject: [PATCH] feat: optimize and simplify python download script

---
Review notes (free text in this area is ignored when the patch is applied):

 * The word lists are now capitalised before de-duplication and written in
   sorted order; previously raw lemma names were de-duplicated and written
   in set iteration order, capitalised at write time.
 * The output files no longer end with a trailing newline: the old loop
   wrote '\n' after every word, the new code uses "\n".join(...). Confirm
   that consumers of data/*.txt do not depend on the final newline.
 * Only the "wordnet" corpus is downloaded now; the
   'averaged_perceptron_tagger' and 'universal_tagset' downloads are gone.
 * The old code kept only synsets whose pos() == 'a'. Verify that
   all_synsets(pos="a") does not additionally yield satellite adjectives
   (pos 's'), which would enlarge adjectives.txt - TODO confirm against the
   NLTK WordNet reader.

 download_data.py | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/download_data.py b/download_data.py
index 31ecdd7..6e0f982 100644
--- a/download_data.py
+++ b/download_data.py
@@ -1,33 +1,29 @@
 import nltk
 import os
 
+
 def main():
-    # Load the corpora
-    nltk.download('wordnet')
-    nltk.download('averaged_perceptron_tagger')
-    nltk.download('universal_tagset')
+    nltk.download("wordnet")
 
-    # Get all the adjectives and nouns
-    adjectives = set()
-    nouns = set()
-    for synset in nltk.corpus.wordnet.all_synsets():
-        for lemma in synset.lemmas():
-            if lemma.name().isalpha():
-                if synset.pos() == 'a':
-                    adjectives.add(lemma.name())
-                elif synset.pos() == 'n':
-                    nouns.add(lemma.name())
-
-    # Create the output directory if it doesn't exist
-    os.makedirs('data', exist_ok=True)
+    adjectives = {
+        lemma.name().capitalize()
+        for synset in nltk.corpus.wordnet.all_synsets(pos="a")
+        for lemma in synset.lemmas()
+        if lemma.name().isalpha()
+    }
+    nouns = {
+        lemma.name().capitalize()
+        for synset in nltk.corpus.wordnet.all_synsets(pos="n")
+        for lemma in synset.lemmas()
+        if lemma.name().isalpha()
+    }
 
-    with open('data/adjectives.txt', 'w+') as file:
-        for adjective in adjectives:
-            file.write(str(adjective).capitalize() + '\n')
-
-    with open('data/nouns.txt', 'w+') as file:
-        for noun in nouns:
-            file.write(str(noun).capitalize() + '\n')
+    os.makedirs("data", exist_ok=True)
+    with open("data/adjectives.txt", "w") as adj_file:
+        adj_file.write("\n".join(sorted(adjectives)))
+    with open("data/nouns.txt", "w") as noun_file:
+        noun_file.write("\n".join(sorted(nouns)))
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    main()