feat: optimize and simplify Python download script
This commit is contained in:
@@ -1,33 +1,29 @@
|
||||
import nltk
|
||||
import os
|
||||
|
||||
|
||||
def main():
    """Download the WordNet corpus and write word lists to the data directory.

    Side effects:
        - Fetches the 'wordnet' corpus via ``nltk.download``.
        - Creates ``data/`` if it does not exist.
        - Writes ``data/adjectives.txt`` and ``data/nouns.txt``: one
          capitalized word per line, sorted, deduplicated.
    """
    # Only the WordNet corpus is used below; the tagger/tagset downloads
    # from the earlier revision are unnecessary and have been dropped.
    nltk.download("wordnet")

    # Purely-alphabetic lemma names, capitalized and deduplicated via a
    # set comprehension.  pos="a" selects adjectives, pos="n" nouns.
    adjectives = {
        lemma.name().capitalize()
        for synset in nltk.corpus.wordnet.all_synsets(pos="a")
        for lemma in synset.lemmas()
        if lemma.name().isalpha()
    }
    nouns = {
        lemma.name().capitalize()
        for synset in nltk.corpus.wordnet.all_synsets(pos="n")
        for lemma in synset.lemmas()
        if lemma.name().isalpha()
    }

    os.makedirs("data", exist_ok=True)
    # Sort for a stable, diff-friendly file; sets have no defined order.
    with open("data/adjectives.txt", "w") as adj_file:
        adj_file.write("\n".join(sorted(adjectives)))
    with open("data/nouns.txt", "w") as noun_file:
        noun_file.write("\n".join(sorted(nouns)))
# Script entry point: run the downloader only when executed directly,
# not when imported as a module.  (The diff rendering duplicated this
# guard — exactly one is kept.)
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user