# -*- coding: utf-8 -*-
import re
from collections import Counter

from WordNet.Lemmatizer import Lemmatizer  # NOTE: the import path of Lemmatizer is an assumption; adjust it to wherever the module lives in your layout


# Frequency/FrequencyDict.py
# Builds the frequency dictionary: counts how often each lemma occurs in the parsed books.
class FrequencyDict:
    def __init__(self, pathToWordNetDict):
        # Regular expression that matches words, including hyphenated and apostrophe forms (e.g. "mother-in-law", "don't")
        self.wordPattern = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        # The counter itself (collections.Counter: word -> number of occurrences)
        self.frequencyDict = Counter()
        # Lemmatizer built on top of the WordNet dictionary files
        self.lemmatizer = Lemmatizer(pathToWordNetDict)

    # Parse a single book; only plain-text files are supported
    def ParseBook(self, file):
        if file.endswith(".txt"):
            self.__ParseTxtFile(file, self.__FindWordsFromContent)
        else:
            print('Warning: The file format is not supported: "%s"' % file)

    # Read a txt file line by line and feed every line to the content handler
    def __ParseTxtFile(self, txtFile, contentHandler):
        try:
            with open(txtFile, 'r') as file:
                for line in file:
                    contentHandler(line)
        except Exception as e:
            print('Error parsing "%s"' % txtFile, e)

    # Extract the words from a line of text, normalize them and update the counter
    def __FindWordsFromContent(self, content):
        result = self.wordPattern.findall(content)
        for word in result:
            word = word.lower()
            # Count the lemma when it can be found, the raw word otherwise
            lemma = self.lemmatizer.GetLemma(word)
            if lemma is not None:
                word = lemma
            self.frequencyDict[word] += 1

    # Return the countWord most frequent entries, sorted by frequency (descending) and alphabetically within equal counts
    def FindMostCommonElements(self, countWord):
        items = list(self.frequencyDict.items())
        items.sort(key=lambda t: t[0])
        items.sort(key=lambda t: t[1], reverse=True)
        return items[0:int(countWord)]
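A minimal usage sketch of the class above. The book file name and both paths are placeholders (not files from the article), and it assumes the WordNet files are already unpacked at the given path:

# Sketch only: paths and the book file are placeholders.
from Frequency.FrequencyDict import FrequencyDict

freqDict = FrequencyDict("e:/Bienne/Frequency/WordNet/wn3.1.dict")  # path to the WordNet dictionary files
freqDict.ParseBook("e:/Bienne/Frequency/Books/alice.txt")           # hypothetical book file
for word, count in freqDict.FindMostCommonElements(10):
    print(word, count)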
# -*- coding: utf-8 -*-
import os


# WordNet/BaseWordNetItem.py
# Base class for all parts of speech: loads the WordNet exception file and index file
# for the given part of speech and implements the common lemma lookup logic.
class BaseWordNetItem:
    def __init__(self, pathWordNetDict, excFile, indexFile):
        self.rule = ()              # suffix-substitution rules; filled in by the subclasses
        self.wordNetExcDict = {}    # exceptions: irregular form -> lemma
        self.wordNetIndexDict = []  # all lemmas listed in the index file
        self.excFile = os.path.join(pathWordNetDict, excFile)      # path to the exception file (e.g. verb.exc)
        self.indexFile = os.path.join(pathWordNetDict, indexFile)  # path to the index file (e.g. index.verb)
        self.__ParseFile(self.excFile, self.__AppendExcDict)       # load the exceptions
        self.__ParseFile(self.indexFile, self.__AppendIndexDict)   # load the index
        self.cacheWords = {}        # cache: key - word form, value - lemma found earlier

    # Parse a line of the exception file: the first field is the irregular form, the second is its lemma
    def __AppendExcDict(self, line):
        group = [item.strip() for item in line.replace("\n", "").split(" ")]
        self.wordNetExcDict[group[0]] = group[1]

    # Parse a line of the index file: the first field is the lemma, the remaining fields are ignored
    def __AppendIndexDict(self, line):
        group = [item.strip() for item in line.split(" ")]
        self.wordNetIndexDict.append(group[0])

    # Read the file line by line and pass every line to the handler
    def __ParseFile(self, file, contentHandler):
        try:
            with open(file, 'r') as openFile:
                for line in openFile:
                    contentHandler(line)
        except Exception:
            raise Exception('Failed to load file: "%s"' % file)

    # Safe dictionary lookup: return the value, or None if the key is missing
    def _GetDictValue(self, dict, key):
        try:
            return dict[key]
        except KeyError:
            return None

    # Return True if the word is present in the index file (i.e. it is already a lemma)
    def _IsDefined(self, word):
        return word in self.wordNetIndexDict

    # Return the lemma (base form) of the word, or None if it cannot be found
    def GetLemma(self, word):
        if word is None:
            return None
        word = word.strip().lower()

        # 1. Look in the cache
        lemma = self._GetDictValue(self.cacheWords, word)
        if lemma is not None:
            return lemma

        # 2. If the word is already in the index file, it is its own lemma
        if self._IsDefined(word):
            return word

        # 3. Look in the exception file (irregular forms)
        lemma = self._GetDictValue(self.wordNetExcDict, word)
        if lemma is not None:
            return lemma

        # 4. Try the suffix-substitution rules
        lemma = self._RuleNormalization(word)
        if lemma is not None:
            self.cacheWords[word] = lemma
            return lemma
        return None

    # Rule-based normalization: replace a known suffix and accept the result only if the index file confirms it
    def _RuleNormalization(self, word):
        for replGroup in self.rule:
            endWord = replGroup[0]
            if word.endswith(endWord):
                lemma = word[:-len(endWord)] + replGroup[1]  # cut the suffix off and append the replacement
                if self._IsDefined(lemma):
                    return lemma
        return None
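To make the rule mechanism concrete, here is a tiny standalone sketch of the same suffix substitution. The word list stands in for the real index.verb file and is made up for illustration:

# Standalone illustration of the logic in _RuleNormalization; "index" is a made-up stand-in for index.verb.
verbRules = (["s", ""], ["ies", "y"], ["es", "e"], ["es", ""], ["ed", "e"], ["ed", ""], ["ing", "e"], ["ing", ""])
index = {"study", "take"}

def normalize(word, rules, index):
    for suffix, replacement in rules:
        if word.endswith(suffix):
            candidate = word[:-len(suffix)] + replacement
            if candidate in index:   # accept only candidates confirmed by the index file
                return candidate
    return None

print(normalize("studies", verbRules, index))  # -> "study"  ("ies" -> "y")
print(normalize("taking", verbRules, index))   # -> "take"   ("ing" -> "e")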
# -*- coding: utf-8 -*-
from WordNet.BaseWordNetItem import BaseWordNetItem


# WordNet/WordNetVerb.py
# Lemmatization of verbs. Everything is inherited from BaseWordNetItem;
# only the dictionary files and the suffix rules are specific to verbs.
class WordNetVerb(BaseWordNetItem):
    def __init__(self, pathToWordNetDict):
        # Call the constructor of the base class (BaseWordNetItem) with the verb exception and index files
        BaseWordNetItem.__init__(self, pathToWordNetDict, 'verb.exc', 'index.verb')

        # Suffix-substitution rules for verbs: a trailing "s" is dropped, "ies" becomes "y", and so on
        self.rule = (
            ["s",   ""],
            ["ies", "y"],
            ["es",  "e"],
            ["es",  ""],
            ["ed",  "e"],
            ["ed",  ""],
            ["ing", "e"],
            ["ing", ""],
        )

    # GetLemma(word) is inherited from BaseWordNetItem without changes
# -*- coding: utf-8 -*-
from WordNet.BaseWordNetItem import BaseWordNetItem


# WordNet/WordNetNoun.py
# Lemmatization of nouns. Inherits from BaseWordNetItem but overrides GetLemma,
# because nouns need a few extra checks (short words, "-ss", "-ful").
class WordNetNoun(BaseWordNetItem):
    def __init__(self, pathToWordNetDict):
        # Call the constructor of the base class (BaseWordNetItem) with the noun exception and index files
        BaseWordNetItem.__init__(self, pathToWordNetDict, 'noun.exc', 'index.noun')

        # Suffix-substitution rules for nouns: a trailing "s" is dropped, "ses" becomes "s", and so on
        self.rule = (
            ["s",    ""],
            ["'s",   ""],
            ["'",    ""],
            ["ses",  "s"],
            ["xes",  "x"],
            ["zes",  "z"],
            ["ches", "ch"],
            ["shes", "sh"],
            ["men",  "man"],
            ["ies",  "y"],
        )

    # Overridden version of GetLemma: the general algorithm is the same as in BaseWordNetItem,
    # plus special handling of short words, words ending in "ss" and words ending in "ful"
    def GetLemma(self, word):
        word = word.strip().lower()

        # Words of two letters or fewer are not normalized
        if len(word) <= 2:
            return None

        # Words ending in "ss" are not normalized (no suffix stripping is attempted for them)
        if word.endswith("ss"):
            return None

        # 1. Look in the cache
        lemma = self._GetDictValue(self.cacheWords, word)
        if lemma is not None:
            return lemma

        # 2. If the word is already in the index file, it is its own lemma
        if self._IsDefined(word):
            return word

        # 3. Look in the exception file (irregular forms)
        lemma = self._GetDictValue(self.wordNetExcDict, word)
        if lemma is not None:
            return lemma

        # 4. If the word ends in "ful", cut the suffix off, normalize the rest and glue "ful" back,
        #    so that e.g. "spoonsful" becomes "spoonful"
        suff = ""
        if word.endswith("ful"):
            word = word[:-3]
            suff = "ful"

        # 5. Try the suffix-substitution rules
        lemma = self._RuleNormalization(word)
        if lemma is not None:
            lemma += suff  # glue "ful" back if it was cut off
            self.cacheWords[word] = lemma
            return lemma
        return None
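Assuming the WordNet files are unpacked locally (the path below is a placeholder), the noun class would behave roughly like this; the expected outputs rely on the standard contents of noun.exc and index.noun:

# Sketch only: the path is a placeholder; requires noun.exc and index.noun on disk.
from WordNet.WordNetNoun import WordNetNoun

noun = WordNetNoun("e:/Bienne/Frequency/WordNet/wn3.1.dict")
print(noun.GetLemma("books"))      # expected "book"     (rule "s" -> "")
print(noun.GetLemma("children"))   # expected "child"    (from the exception file noun.exc)
print(noun.GetLemma("spoonsful"))  # expected "spoonful" (the "ful" suffix is cut off and glued back)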
# -*- coding: utf-8 -*-
from WordNet.BaseWordNetItem import BaseWordNetItem


# WordNet/WordNetAdverb.py
# Lemmatization of adverbs. Everything is inherited from BaseWordNetItem;
# only the exception file (adv.exc) and the index file (index.adv) are specific.
# WordNet defines no suffix rules for adverbs, so self.rule stays empty.
class WordNetAdverb(BaseWordNetItem):
    def __init__(self, pathToWordNetDict):
        # Call the constructor of the base class (BaseWordNetItem) with the adverb exception and index files
        BaseWordNetItem.__init__(self, pathToWordNetDict, 'adv.exc', 'index.adv')
# -*- coding: utf-8 -*-
from WordNet.BaseWordNetItem import BaseWordNetItem


# WordNet/WordNetAdjective.py
# Lemmatization of adjectives. Everything is inherited from BaseWordNetItem;
# only the dictionary files and the suffix rules are specific to adjectives.
class WordNetAdjective(BaseWordNetItem):
    def __init__(self, pathToWordNetDict):
        # Call the constructor of the base class (BaseWordNetItem) with the adjective exception and index files
        BaseWordNetItem.__init__(self, pathToWordNetDict, 'adj.exc', 'index.adj')

        # Suffix-substitution rules for adjectives: "er"/"est" is either dropped or replaced with "e"
        self.rule = (
            ["er",  ""],
            ["er",  "e"],
            ["est", ""],
            ["est", "e"],
        )

    # GetLemma(word) is inherited from BaseWordNetItem without changes
# -*- coding: utf-8 -*-
from WordNet.WordNetAdjective import WordNetAdjective
from WordNet.WordNetAdverb import WordNetAdverb
from WordNet.WordNetNoun import WordNetNoun
from WordNet.WordNetVerb import WordNetVerb


# Lemmatizer: tries every part of speech in turn and returns the first lemma found.
class Lemmatizer:
    def __init__(self, pathToWordNetDict):
        self.splitter = "-"  # hyphenated words are lemmatized part by part

        adj = WordNetAdjective(pathToWordNetDict)   # adjectives
        noun = WordNetNoun(pathToWordNetDict)       # nouns
        adverb = WordNetAdverb(pathToWordNetDict)   # adverbs
        verb = WordNetVerb(pathToWordNetDict)       # verbs
        self.wordNet = [verb, noun, adj, adverb]

    # Return the lemma of the word (for compound words, of every part), or None if nothing was found
    def GetLemma(self, word):
        # Split compound words such as "red-hot" and normalize each part separately
        wordArr = word.split(self.splitter)
        resultWord = []
        for word in wordArr:
            lemma = self.__GetLemmaWord(word)
            if lemma is not None:
                resultWord.append(lemma)
        if resultWord:
            return self.splitter.join(resultWord)
        return None

    # Return the lemma of a single (non-compound) word
    def __GetLemmaWord(self, word):
        for item in self.wordNet:
            lemma = item.GetLemma(word)
            if lemma is not None:
                return lemma
        return None
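A usage sketch for the lemmatizer. The path is a placeholder for wherever the WordNet exception and index files live, and the import path of Lemmatizer is the same assumption made in FrequencyDict above; the expected outputs rely on the standard WordNet exception files:

# Sketch only: the path is a placeholder for the unpacked WordNet dictionary files.
from WordNet.Lemmatizer import Lemmatizer   # import path is an assumption, see FrequencyDict above

lemmatizer = Lemmatizer("e:/Bienne/Frequency/WordNet/wn3.1.dict")
print(lemmatizer.GetLemma("went"))      # expected "go"    (verb exception file)
print(lemmatizer.GetLemma("mice"))      # expected "mouse" (noun exception file)
print(lemmatizer.GetLemma("red-hot"))   # the hyphenated word is lemmatized part by part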
# -*- coding: utf-8 -*-
import os


# StarDict/BaseStarDictItem.py
# Base class for the parts of a StarDict dictionary (.ifo, .idx, .dict):
# locates the file with the given extension inside the dictionary directory.
class BaseStarDictItem:
    def __init__(self, pathToDict, exp):
        # Encoding used by the dictionary files
        self.encoding = "utf-8"
        # Full path to the file with the given extension
        self.dictionaryFile = self.__PathToFileInDirByExp(pathToDict, exp)
        # Real size of that file on disk (used later for consistency checks)
        self.realFileSize = os.path.getsize(self.dictionaryFile)

    # Return the path to the first file in the directory "path" that has the extension "exp"
    def __PathToFileInDirByExp(self, path, exp):
        if not os.path.exists(path):
            raise Exception('Path "%s" does not exist' % path)
        end = '.%s' % exp
        matchingFiles = [f for f in os.listdir(path) if f.endswith(end)]
        if matchingFiles:
            return os.path.join(path, matchingFiles[0])
        else:
            raise Exception('File does not exist: "*.%s"' % exp)
# -*- coding: utf-8 -*-
from StarDict.BaseStarDictItem import BaseStarDictItem
from Frequency.IniParser import IniParser


# Wrapper around the .ifo file: a plain-text file with "key=value" lines
# that describes the dictionary (its name, word count, index size and so on).
class Ifo(BaseStarDictItem):
    def __init__(self, pathToDict):
        # Call the constructor of the base class (BaseStarDictItem) to locate the .ifo file
        BaseStarDictItem.__init__(self, pathToDict, 'ifo')

        # Parser for the "key=value" lines of the .ifo file
        self.iniParser = IniParser(self.dictionaryFile)

        # Parameters of the .ifo file; the second argument is the default value.
        # A default of None means the parameter is required and its absence is an error.
        self.bookName = self.__getParameterValue("bookname", None)                  # dictionary name [required]
        self.wordCount = self.__getParameterValue("wordcount", None)                # number of words in the ".idx" file [required]
        self.synWordCount = self.__getParameterValue("synwordcount", "")            # number of synonyms in the ".syn" file [required only if a ".syn" file exists]
        self.idxFileSize = self.__getParameterValue("idxfilesize", None)            # size (in bytes) of the ".idx" file, used as a consistency check [required]
        self.idxOffsetBits = self.__getParameterValue("idxoffsetbits", 32)          # size of the offsets into .dict, 32 or 64 bits; introduced in format 3.0.0, defaults to 32 [optional]
        self.author = self.__getParameterValue("author", "")                        # author [optional]
        self.email = self.__getParameterValue("email", "")                          # author's e-mail [optional]
        self.description = self.__getParameterValue("description", "")              # description of the dictionary [optional]
        self.date = self.__getParameterValue("date", "")                            # creation date [optional]
        self.sameTypeSequence = self.__getParameterValue("sametypesequence", None)  # types of the data stored in the .dict file [required here]
        self.dictType = self.__getParameterValue("dicttype", "")                    # used by special plugins, e.g. WordNet [optional]

    # Return the value of a parameter from the .ifo file, or the default value,
    # or raise an exception if a required parameter (default None) is missing
    def __getParameterValue(self, key, defaultValue):
        try:
            return self.iniParser.GetValue(key)
        except Exception:
            if defaultValue is not None:
                return defaultValue
            raise Exception('\n"%s" has invalid format (missing parameter: "%s")' % (self.dictionaryFile, key))
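For reference, an .ifo file is an ordinary text file with a signature line followed by one "key=value" pair per line; an illustrative example with made-up values might look like this:

StarDict's dict ifo file
version=2.4.2
bookname=Some English-Russian dictionary
wordcount=43031
idxfilesize=967479
sametypesequence=tm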
# -*- coding: utf-8 -*-
from struct import unpack
from StarDict.BaseStarDictItem import BaseStarDictItem


# Wrapper around the .idx file: the index of the dictionary.
# Every entry is "word\0" followed by the offset and the size of the article in the .dict file.
class Idx(BaseStarDictItem):
    def __init__(self, pathToDict, wordCount, idxFileSize, idxOffsetBits):
        # Call the constructor of the base class (BaseStarDictItem) to locate the .idx file
        BaseStarDictItem.__init__(self, pathToDict, 'idx')

        self.idxDict = {}                              # the index itself: self.idxDict = {'word': [offset_in_dict, size_in_dict], ...}
        self.idxFileSize = int(idxFileSize)            # size of the .idx file as declared in the .ifo file
        self.idxOffsetBytes = int(idxOffsetBits) // 8  # size (in bytes) of the offsets into the .dict file
        self.wordCount = int(wordCount)                # number of words declared in the .ifo file

        # Check that the real size of the .idx file matches the "idxfilesize" value from the .ifo file
        self.__CheckRealFileSize()

        # Fill self.idxDict from the .idx file
        self.__FillIdxDict()

        # Check that the number of parsed words matches the "wordcount" value from the .ifo file
        self.__CheckRealWordCount()

    # Verify the declared file size against the real one
    def __CheckRealFileSize(self):
        if self.realFileSize != self.idxFileSize:
            raise Exception('size of the "%s" is incorrect' % self.dictionaryFile)

    # Verify the declared word count against the number of parsed entries
    def __CheckRealWordCount(self):
        realWordCount = len(self.idxDict)
        if realWordCount != self.wordCount:
            raise Exception('word count of the "%s" is incorrect' % self.dictionaryFile)

    # Read sizeInt bytes from the stream and convert them to an integer (big-endian byte order)
    def __getIntFromByteArray(self, sizeInt, stream):
        byteArray = stream.read(sizeInt)
        formatCharacter = 'L'       # "unsigned long" (when sizeInt = 4)
        if sizeInt == 8:
            formatCharacter = 'Q'   # "unsigned long long" (when sizeInt = 8)
        fmt = '>' + formatCharacter  # '>' means big-endian
        integer = (unpack(fmt, byteArray))[0]
        return int(integer)

    # Parse the .idx file (a sequence of 3-field records) and fill self.idxDict
    def __FillIdxDict(self):
        wordBytes = bytearray()
        with open(self.dictionaryFile, 'rb') as stream:
            while True:
                byte = stream.read(1)
                if not byte:
                    break  # end of file
                if byte != b'\0':
                    # the headword is a '\0'-terminated utf-8 string; collect its bytes
                    wordBytes += byte
                else:
                    # decode the headword only once it is complete (multi-byte utf-8 characters cannot be decoded byte by byte)
                    languageWord = wordBytes.decode(self.encoding)
                    # after the '\0' come two big-endian integers: the offset and the size of the article in .dict
                    wordDataOffset = self.__getIntFromByteArray(self.idxOffsetBytes, stream)  # offset of the article in .dict
                    wordDataSize = self.__getIntFromByteArray(4, stream)                      # size of the article in .dict
                    self.idxDict[languageWord] = [wordDataOffset, wordDataSize]               # record: headword + offset + size
                    wordBytes = bytearray()

    # Return [offset, size] of the article for the given word in the .dict file,
    # or [None, None] if the word is not in the index
    def GetLocationWord(self, word):
        try:
            return self.idxDict[word]
        except KeyError:
            return [None, None]
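The binary layout that __FillIdxDict expects can be illustrated by packing a single record by hand; the word, offset and size below are made up:

# Illustration of one .idx record: headword + '\0' + big-endian offset + big-endian size (values are made up).
from struct import pack, unpack

record = b"abandon" + b"\0" + pack(">L", 1024) + pack(">L", 215)

# This is what Idx recovers from such a record:
word, rest = record.split(b"\0", 1)
offset, size = unpack(">LL", rest)
print(word.decode("utf-8"), offset, size)   # abandon 1024 215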
# -*- coding: utf-8 -*-
from StarDict.BaseStarDictItem import BaseStarDictItem

# Wrapper around the .dict file, which holds the articles themselves
# (this project only uses dictionaries with sametypesequence = tm).
# The possible data types of an article (the letters of the "sametypesequence" value) are:
# 'm' - plain text meaning, utf-8, terminated by '\0'
# 'l' - meaning in the locale encoding, terminated by '\0'
# 'g' - text marked up with Pango markup
# 't' - phonetic transcription, utf-8, terminated by '\0'
# 'x' - utf-8 text marked up with xdxf
# 'y' - utf-8 text with Chinese (YinBiao) or Japanese (KANA) pronunciation
# 'k' - utf-8 text marked up with KingSoft PowerWord XML
# 'w' - MediaWiki markup
# 'h' - HTML
# 'n' - WordNet data
# 'r' - list of resource files: pictures (jpg), sounds (wav), video (avi), binaries (bin) and so on
# 'W' - wav file
# 'P' - picture
# 'X' - reserved for experimental extensions


class Dict(BaseStarDictItem):
    def __init__(self, pathToDict, sameTypeSequence):
        # Call the constructor of the base class (BaseStarDictItem) to locate the .dict file
        BaseStarDictItem.__init__(self, pathToDict, 'dict')
        # Type sequence of the articles, taken from the .ifo file
        self.sameTypeSequence = sameTypeSequence

    # Return the article stored at the given offset, or None if the arguments are invalid
    def GetTranslation(self, wordDataOffset, wordDataSize):
        try:
            # Make sure the requested chunk lies inside the .dict file
            self.__CheckValidArguments(wordDataOffset, wordDataSize)
            # Read wordDataSize bytes starting at wordDataOffset
            with open(self.dictionaryFile, 'rb') as file:
                file.seek(wordDataOffset)
                byteArray = file.read(wordDataSize)
            return byteArray.decode(self.encoding)  # self.encoding is defined in BaseStarDictItem
        except Exception:
            return None

    # Raise an exception if the offset/size pair does not describe a valid chunk of the .dict file
    def __CheckValidArguments(self, wordDataOffset, wordDataSize):
        if wordDataOffset is None or wordDataSize is None:
            raise Exception('Word location is not defined')
        endDataSize = wordDataOffset + wordDataSize
        if wordDataOffset < 0 or wordDataSize < 0 or endDataSize > self.realFileSize:
            raise Exception('Word location is outside of the ".dict" file')
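Putting the three pieces together, a lookup sketch might look like this. The dictionary path and the word are placeholders, and the import paths assume one module per class (matching the rest of the project); the real project wraps this wiring in the StarDict class, which is not shown in this section:

# Sketch only: path, word and the StarDict.Ifo/Idx/Dict module names are assumptions.
from StarDict.Ifo import Ifo
from StarDict.Idx import Idx
from StarDict.Dict import Dict

path = "e:/Bienne/Frequency/Dict/stardict-comn_dictd04_korolew"
ifo = Ifo(path)
idx = Idx(path, ifo.wordCount, ifo.idxFileSize, ifo.idxOffsetBits)
dictFile = Dict(path, ifo.sameTypeSequence)

offset, size = idx.GetLocationWord("abandon")
print(dictFile.GetTranslation(offset, size))   # the article text, or None if the word is not in the dictionary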
# -*- coding: utf-8 -*-
import os
import xlwt3 as xlwt

from Frequency.IniParser import IniParser
from Frequency.FrequencyDict import FrequencyDict
from StarDict.StarDict import StarDict

ConfigFileName = "Settings.ini"


# The entry point: reads the settings, builds the frequency dictionary from all the books,
# translates the most frequent words and saves the result to an Excel file.
class Main:
    def __init__(self):
        self.listLanguageDict = []  # list of loaded StarDict dictionaries
        self.result = []            # final result (frequency, word, translation)
        try:
            # Read the settings file
            config = IniParser(ConfigFileName)
            self.pathToBooks = config.GetValue("PathToBooks")              # directory with the books to analyse
            self.pathResult = config.GetValue("PathToResult")              # directory where the Excel report is saved
            self.countWord = config.GetValue("CountWord")                  # how many of the most frequent words to output
            self.pathToWordNetDict = config.GetValue("PathToWordNetDict")  # directory with the WordNet dictionary files
            self.pathToStarDict = config.GetValue("PathToStarDict")        # directories with StarDict dictionaries, separated by ";"

            # There may be several StarDict dictionaries; load each of them
            listPathToStarDict = [item.strip() for item in self.pathToStarDict.split(";")]
            for path in listPathToStarDict:
                languageDict = StarDict(path)
                self.listLanguageDict.append(languageDict)

            # Collect the list of books to parse
            self.listBooks = self.__GetAllFiles(self.pathToBooks)

            # Create the frequency dictionary (it uses the WordNet lemmatizer internally)
            self.frequencyDict = FrequencyDict(self.pathToWordNetDict)

            # Run the whole pipeline
            self.__Run()
        except Exception as e:
            print('Error: "%s"' % e)

    # Return the full paths of all files in the given directory
    def __GetAllFiles(self, path):
        try:
            return [os.path.join(path, file) for file in os.listdir(path)]
        except Exception:
            raise Exception('Path "%s" does not exist' % path)

    # Return the translation of the word from the first dictionary that knows it
    def __GetTranslate(self, word):
        valueWord = ""
        for languageDict in self.listLanguageDict:
            valueWord = languageDict.Translate(word)
            if valueWord != "":
                return valueWord
        return valueWord

    # Save the result (frequency, word, translation) for the countWord most frequent words to an Excel file
    def __SaveResultToExcel(self):
        try:
            if not os.path.exists(self.pathResult):
                raise Exception('No such directory: "%s"' % self.pathResult)
            if self.result:
                description = 'Frequency Dictionary'
                style = xlwt.easyxf('font: name Times New Roman')
                wb = xlwt.Workbook()
                ws = wb.add_sheet(description + ' ' + self.countWord)
                nRow = 0
                for item in self.result:
                    ws.write(nRow, 0, item[0], style)  # frequency
                    ws.write(nRow, 1, item[1], style)  # word
                    ws.write(nRow, 2, item[2], style)  # translation
                    nRow += 1
                wb.save(os.path.join(self.pathResult, description + '.xls'))
        except Exception as e:
            print(e)

    # The whole pipeline: parse the books, pick the most frequent words, translate them, save the report
    def __Run(self):
        # Parse every book and accumulate the word frequencies
        for book in self.listBooks:
            self.frequencyDict.ParseBook(book)

        # Take the countWord most frequent words
        mostCommonElements = self.frequencyDict.FindMostCommonElements(self.countWord)

        # Translate every word and collect the rows of the report
        for item in mostCommonElements:
            word = item[0]
            counterWord = item[1]
            valueWord = self.__GetTranslate(word)
            self.result.append([counterWord, word, valueWord])

        # Save the report to Excel
        self.__SaveResultToExcel()


if __name__ == "__main__":
    main = Main()
; Directory with the books (plain-text files) that will be analysed
PathToBooks = e:\Bienne\Frequency\Books

; Directory with the WordNet dictionary files (exception and index files)
PathToWordNetDict = e:\Bienne\Frequency\WordNet\wn3.1.dict\

; Directories with StarDict dictionaries (several paths can be listed, separated by ";")
PathToStarDict = e:\Bienne\Frequency\Dict\stardict-comn_dictd04_korolew

; Number of the most frequent words to write to the Excel report
CountWord = 100

; Directory where the Excel report is saved (three columns: frequency, word, translation)
PathToResult = e:\Bienne\Frequency\Books
Source: https://habr.com/ru/post/161073/