feat: optimize and simplify python download script

2025-04-05 15:25:40 +02:00
parent f5b2737178
commit 5071c28f4f
1 changed file with 22 additions and 26 deletions
--- a/download_data.py
+++ b/download_data.py
@@ -1,33 +1,29 @@
 import nltk
 import os
 def main():
-    # Load the corpora
+    nltk.download("wordnet")
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('universal_tagset')
-    # Get all the adjectives and nouns
+    adjectives = {
-    adjectives = set()
+        lemma.name().capitalize()
-    nouns = set()
+        for synset in nltk.corpus.wordnet.all_synsets(pos="a")
-    for synset in nltk.corpus.wordnet.all_synsets():
+        for lemma in synset.lemmas()
-        for lemma in synset.lemmas():
+        if lemma.name().isalpha()
-            if lemma.name().isalpha():
+    }
-                if synset.pos() == 'a':
+    nouns = {
-                    adjectives.add(lemma.name())
+        lemma.name().capitalize()
-                elif synset.pos() == 'n':
+        for synset in nltk.corpus.wordnet.all_synsets(pos="n")
-                    nouns.add(lemma.name())
+        for lemma in synset.lemmas()
-    
+        if lemma.name().isalpha()
-    # Create the output directory if it doesn't exist
+    }
    os.makedirs('data', exist_ok=True)
-    with open('data/adjectives.txt', 'w+') as file:
+    os.makedirs("data", exist_ok=True)
-        for adjective in adjectives:
+    with open("data/adjectives.txt", "w") as adj_file:
-            file.write(str(adjective).capitalize() + '\n')
+        adj_file.write("\n".join(sorted(adjectives)))
-    
+    with open("data/nouns.txt", "w") as noun_file:
-    with open('data/nouns.txt', 'w+') as file:
+        noun_file.write("\n".join(sorted(nouns)))
        for noun in nouns:
            file.write(str(noun).capitalize() + '\n')
-if __name__ == '__main__':
+
-    main()
+if __name__ == "__main__":
    main()