Python program for statistical text analysis

Counting how often each letter occurs in English and Russian texts is one stage of linguistic and statistical analysis. The online Catalog of Linguistic Programs and Resources lists no Python program that solves this problem.

The Python forums contain fragments of such a program, but they target a single language, mainly English. Given this, I have developed a program for statistical processing of both Russian and English texts.

Import and initial variables

# Plotting: reset matplotlib to its built-in defaults before any customisation.
import matplotlib.pyplot as plt
plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
# GUI toolkit: widgets, file dialogs and message boxes are used unqualified
# throughout the script, hence the star imports.
from tkinter import *
from tkinter.filedialog import *
from tkinter.messagebox import *
import fileinput
import matplotlib as mpl
# Select a font family for the chart labels.
mpl.rcParams['font.family'] = 'fantasy'
mpl.rcParams['font.fantasy'] = 'Comic Sans MS, Arial'

Opening a file with English text

 def w_open_ing():
     """Ask the user for a file and analyse it as English text.

     Opens a file-selection dialog and passes the chosen path to ``main``
     together with the code points of 'a' and 'z', which bound the alphabet
     to be counted. Does nothing if the dialog is cancelled.
     """
     op = askopenfilename()
     # A cancelled dialog yields an empty value; handing it to main() would
     # only make it fail while trying to open a file with no name.
     if not op:
         return
     aa = ord('a')
     bb = ord('z')
     main(op, aa, bb)

Opening a file with Russian text

 def w_open_rus():
     """Ask the user for a file and analyse it as Russian text.

     Opens a file-selection dialog and passes the chosen path to ``main``
     together with the first and last code points of the Cyrillic range to
     be counted. Does nothing if the dialog is cancelled.
     """
     op = askopenfilename()
     # A cancelled dialog yields an empty value; handing it to main() would
     # only make it fail while trying to open a file with no name.
     if not op:
         return
     # The bounds are written as escapes so they survive re-encoding of this
     # file; with empty literals ord() raises TypeError.
     # NOTE(review): the upper bound is U+0451 (io) rather than U+044F (ya)
     # because the accompanying article states that the letter io is counted
     # separately - confirm against the original source. The range therefore
     # also contains U+0450, which is not part of the Russian alphabet.
     aa = ord('\u0430')  # CYRILLIC SMALL LETTER A
     bb = ord('\u0451')  # CYRILLIC SMALL LETTER IO
     main(op, aa, bb)

Universal data processing for both languages

 def _count_letters(text, alpha):
     """Return ``{letter: occurrences}`` for every letter of ``alpha``.

     Alphabetic characters of ``text`` are lower-cased before counting;
     letters of ``alpha`` that never occur map to 0.
     """
     from collections import Counter

     seen = Counter(ch.lower() for ch in text if ch.isalpha())
     return {letter: seen[letter] for letter in alpha}


 def main(op, aa, bb):
     """Count letter frequencies in a text file, then print and plot them.

     Parameters:
         op: path of the text file to analyse.
         aa: code point of the first letter of the alphabet to count.
         bb: code point of the last letter of the alphabet (inclusive).

     A table with one row per letter (count and percentage of all counted
     letters) is appended to the global ``txt`` widget, followed by the
     total, and a horizontal bar chart of the counts is shown.
     """
     alpha = [chr(code) for code in range(aa, bb + 1)]

     # The file is read with the platform default encoding, as before; the
     # context manager closes it even if reading fails.
     with open(op, 'r') as f:
         text = f.read()

     k = _count_letters(text, alpha)
     z = sum(k.values())

     rule = '|----------------------------|-----------------------------|---------------------------|\n'
     txt.insert(END, '|\tletter\t|\tcount\t|\tpercent,%\t\n')
     txt.insert(END, rule)
     for letter in alpha:
         # A text without any letter of this alphabet gives z == 0; report
         # 0.0 % instead of dividing by zero.
         percent = round(k[letter] * 100.0 / z, 2) if z else 0.0
         txt.insert(END, '|\t%s\t|\t%d\t|\t%s\t\n' % (letter, k[letter], percent))
     txt.insert(END, rule)
     txt.insert(END, 'Total letters: %d\n' % z)

     # Horizontal bar chart: one bar per letter, labelled with the letter.
     y_pos = np.arange(len(alpha))
     plt.barh(y_pos, [k[letter] for letter in alpha])
     plt.yticks(y_pos, alpha)
     plt.xlabel('Quantity(amount) of the uses of the letter in the text')
     plt.title('The letters of the alphabet')
     plt.show()

Field cleaning

 def clear_text():
     """Remove everything currently shown in the global ``txt`` widget."""
     first, last = 1.0, END
     txt.delete(first, last)

Write data from field to file

 def save_file():
     """Write the contents of the global ``txt`` widget to a chosen file.

     Opens a save-as dialog and writes the widget text to the selected path
     as UTF-8. Does nothing if the dialog is cancelled; if the file cannot
     be written, an error dialog is shown instead of failing silently.
     """
     save_as = asksaveasfilename()
     # A cancelled dialog yields an empty value: there is nothing to save to.
     if not save_as:
         return
     x = txt.get(1.0, END)
     try:
         # Let the file object do the encoding. Passing encoded bytes to
         # writelines() on a text-mode file raises TypeError, which used to
         # be swallowed and left an empty file behind.
         with open(save_as, "w", encoding="utf-8") as f:
             f.write(x)
     except OSError as err:
         showerror("Save file", "Could not save the file:\n%s" % err)

Closing the program

 def close_win():
     """Ask for confirmation and destroy the main window if the user agrees."""
     confirmed = askyesno("Exit", "Do you want to quit?")
     if not confirmed:
         return
     tk.destroy()

Standard Interface Tkinter

 # Build the main window: a single drop-down menu and a text area that
 # receives the frequency table.
 tk = Tk()

 main_menu = Menu(tk)
 tk.config(menu=main_menu)

 file_menu = Menu(main_menu)
 # Label corrected from the misspelt "Aphabet".
 main_menu.add_cascade(label="Alphabet", menu=file_menu)

 # Menu entries in display order, each paired with its handler.
 for entry_label, handler in (
     ("English text", w_open_ing),
     ("Russian text", w_open_rus),
     ("Save file", save_file),
     ("Cleaning", clear_text),
     ("Exit", close_win),
 ):
     file_menu.add_command(label=entry_label, command=handler)

 # Output area; the handlers above refer to it through the global name txt.
 txt = Text(tk, width=72, height=10, font="Arial 12", wrap=WORD)
 txt.pack()

 tk.mainloop()

Benefits

The program is written in Python, which simplifies its use in BigARTM and Gensim.
Takes into account the difference between the Russian letters "е" and "ё".
It has a graphical interface and, at the same time, is freely distributed.

Source: https://habr.com/ru/post/323252/

All Articles