📜 ⬆️ ⬇️

Python program for statistical text analysis



Counting how often individual letters occur in English and Russian texts is one of the stages of linguistic and statistical analysis. The Catalog of Linguistic Programs and Resources on the Web does not list a Python program that solves this problem.

Python forums contain fragments of such a program, but they target a single language, mainly English. For this reason, I have developed a program that performs the statistical processing for both Russian and English texts.

Import and initial variables


import matplotlib.pyplot as plt; plt.rcdefaults() import numpy as np import matplotlib.pyplot as plt from tkinter import * from tkinter.filedialog import * from tkinter.messagebox import * import fileinput import matplotlib as mpl mpl.rcParams['font.family'] = 'fantasy' mpl.rcParams['font.fantasy'] = 'Comic Sans MS, Arial' 

Opening a file with English text


 def w_open_ing():
     """Ask for a text file and run the letter statistics for English.

     The alphabet is handed to ``main`` as the inclusive code-point range
     'a'..'z'.  Nothing happens when the file dialog is cancelled.
     """
     op = askopenfilename()
     # A cancelled dialog yields an empty value; passing it on would make
     # main() fail when it tries to open the file.
     if not op:
         return
     main(op, ord('a'), ord('z'))

Opening a file with Russian text


 def w_open_rus():
     """Ask for a text file and run the letter statistics for Russian.

     The alphabet is handed to ``main`` as the inclusive code-point range
     'а'..'я' (U+0430..U+044F).  Nothing happens when the file dialog is
     cancelled.
     """
     op = askopenfilename()
     # A cancelled dialog yields an empty value; passing it on would make
     # main() fail when it tries to open the file.
     if not op:
         return
     # ord('') raises TypeError, so the bounds must be real Cyrillic letters.
     # NOTE: 'ё' (U+0451) lies outside this contiguous range and is therefore
     # not part of the alphabet passed to main().
     main(op, ord('а'), ord('я'))

Universal data processing for both languages


 def main(op, aa, bb):
     """Count the letters of an alphabet in a text file and show the result.

     A table with one row per letter (count and percentage) followed by the
     total is appended to the global ``txt`` widget, then a horizontal bar
     chart of the counts is displayed with matplotlib.

     Args:
         op: Path of the text file to analyse.
         aa: Code point of the first letter of the alphabet.
         bb: Code point of the last letter of the alphabet (inclusive).

     Raises:
         OSError: If the file cannot be opened or read.
     """
     from collections import Counter

     alpha = [chr(w) for w in range(aa, bb + 1)]

     # No encoding is given, so the platform default is used for decoding.
     with open(op, 'r') as f:
         text = f.read()

     # One pass over the text instead of one pass per alphabet letter.
     found = Counter(w.lower() for w in text if w.isalpha())
     k = {i: found[i] for i in alpha}
     z = sum(k.values())

     rule = '|----------------------------|-----------------------------|---------------------------|\n'
     txt.insert(END, '|\tletter\t|\tcount\t|\tpercent,%\t\n')
     txt.insert(END, rule)
     for i in alpha:
         # z is 0 when the text holds no letter of this alphabet; report
         # 0.0 % instead of failing with ZeroDivisionError.
         persent = round(k[i] * 100.0 / z, 2) if z else 0.0
         txt.insert(END, '|\t%s\t|\t%d\t|\t%s\t\n' % (i, k[i], persent))
     txt.insert(END, rule)
     txt.insert(END, 'Total letters: %d\n' % z)

     # Bar chart: one horizontal bar per letter, labelled with the letter.
     y_pos = np.arange(len(alpha))
     plt.barh(y_pos, [k[i] for i in alpha])
     plt.yticks(y_pos, alpha)
     plt.xlabel('Quantity(amount) of the uses of the letter in the text')
     plt.title('The letters of the alphabet')
     plt.show()

Field cleaning


 def clear_text():
     """Remove everything currently shown in the global ``txt`` widget."""
     first_index = 1.0  # start of the widget's content
     txt.delete(first_index, END)

Write data from field to file


 def save_file():
     """Write the contents of the global ``txt`` widget to a chosen file.

     The text is stored UTF-8 encoded.  Nothing is written when the dialog
     is cancelled; an I/O failure is reported in a message box.
     """
     save_as = asksaveasfilename()
     if not save_as:
         return  # dialog cancelled, nothing to save

     x = txt.get(1.0, END)
     try:
         # Let the file object do the encoding.  Passing the bytes from
         # x.encode('utf8') to writelines() of a text-mode file raises
         # TypeError, which a bare ``except: pass`` hid, leaving the
         # target file empty.
         with open(save_as, "w", encoding="utf8") as f:
             f.write(x)
     except OSError as err:
         showerror("Save file", "Could not save the file:\n%s" % err)

Closing the program


 def close_win():
     """Close the main window once the user confirms the exit prompt."""
     confirmed = askyesno("Exit", "Do you want to quit?")
     if not confirmed:
         return
     tk.destroy()

Standard Interface Tkinter


 # Main window: one drop-down menu plus the text field that receives the
 # statistics table.  ``tk`` and ``txt`` are module-level names because the
 # menu callbacks refer to them.
 tk = Tk()
 main_menu = Menu(tk)
 tk.config(menu=main_menu)

 file_menu = Menu(main_menu)
 main_menu.add_cascade(label="Alphabet", menu=file_menu)
 file_menu.add_command(label="English text", command=w_open_ing)
 file_menu.add_command(label="Russian text", command=w_open_rus)
 file_menu.add_command(label="Save file", command=save_file)
 file_menu.add_command(label="Cleaning", command=clear_text)
 file_menu.add_command(label="Exit", command=close_win)

 txt = Text(tk, width=72, height=10, font="Arial 12", wrap=WORD)
 txt.pack()

 # Blocks until the main window is closed.
 tk.mainloop()

Benefits



')

Source: https://habr.com/ru/post/323252/


All Articles