
titles =[ " WikiLeaks", " , ", " 19 ", " Wikileaks ", " ", " Wikileaks", " ", " WikiLeaks, , ", " " ]
We will need numpy and scipy; to bring the words to their base form we will use nltk. Installing:
pip install numpy
pip install nltk
pip install scipy
If installing scipy runs into any problems (it requires a BLAS library to be installed), this will probably help:
apt-get install gfortran libopenblas-dev liblapack-dev
from math import log, sqrt

from numpy import zeros
from numpy.linalg import svd


def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings; other values pass through."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _stop_key(item):
    """Return the form of a stop-list entry used for matching (lower-cased text)."""
    text = _to_text(item)
    lower = getattr(text, 'lower', None)
    return lower() if callable(lower) else text


class LSI(object):
    """Small latent-semantic-indexing helper built on a word/document count matrix.

    Attributes populated by the methods below:

    * ``dictionary`` -- list of stemmed words; a word's *code* is its index.
    * ``docs``       -- one list of word codes per added document.
    * ``wdict``      -- code -> list of document numbers containing the word
      (a document number is repeated once per occurrence).
    * ``keys``       -- sorted codes that own a row of ``A`` (set by ``build``).
    * ``A``          -- matrix with one row per key and one column per document.
    * ``U``, ``S``, ``Vt`` -- result of ``svd(A)`` (set by ``calc``).

    Words are stemmed with the ``stemmer`` passed to the constructor or, when
    none is given, with a module-level object named ``stemmer`` (anything with
    a ``stem(text)`` method).  This section does not define that object, so it
    must be provided elsewhere.

    The ``print(...)`` calls always pass exactly one argument, so they produce
    the same output as a Python 2 print statement and a Python 3 function call.
    """

    def __init__(self, stopwords, ignorechars, docs, stemmer=None):
        """Create the index and add every document in *docs*.

        Args:
            stopwords: words to keep out of the matrix (they still get a
                dictionary code).  Entries are matched case-insensitively
                against the cleaned token and its stem; integer entries are
                matched against word codes.
            ignorechars: characters deleted from every word before stemming
                (text or UTF-8 bytes).
            docs: iterable of documents (text or UTF-8 bytes).
            stemmer: optional object with a ``stem(text)`` method; defaults to
                the module-level ``stemmer``.
        """
        self.wdict = {}
        self.dictionary = []
        # The original never created this list, so add_doc() failed on the
        # very first document.
        self.docs = []
        self.stopwords = stopwords
        self._stop_terms = set(_stop_key(item) for item in (stopwords or ()))
        # Kept as text so characters are removed per character, not per byte.
        self.ignorechars = _to_text(ignorechars) or u''
        self._stemmer = stemmer
        for doc in docs:
            self.add_doc(doc)

    def prepare(self):
        """Rebuild the count matrix and recompute its SVD.

        NOTE: ``build`` recreates ``A`` from raw counts, so any weighting
        previously applied with ``TFIDF`` is discarded here.
        """
        self.build()
        self.calc()

    def _clean(self, word):
        """Lower-case *word* and drop every character listed in ``ignorechars``."""
        text = _to_text(word).lower()
        return u''.join(ch for ch in text if ch not in self.ignorechars)

    def _is_stopword(self, token, code):
        """Tell whether the token (or its stem, or its code) is on the stop list."""
        stop = self._stop_terms
        if not stop:
            return False
        return (code in stop
                or self.dictionary[code] in stop
                or self._clean(token) in stop)

    def dic(self, word, add=False):
        """Return the dictionary code of *word* after cleaning and stemming.

        Args:
            word: the word to look up (text or UTF-8 bytes).
            add: when true, an unknown word is appended to the dictionary and
                its new code is returned.

        Returns:
            The integer code, or ``None`` when the word is unknown and *add*
            is false.
        """
        active = self._stemmer if self._stemmer is not None else stemmer
        term = active.stem(self._clean(word))
        try:
            return self.dictionary.index(term)
        except ValueError:
            pass
        if not add:
            return None
        self.dictionary.append(term)
        return len(self.dictionary) - 1

    def add_doc(self, doc):
        """Split *doc* on whitespace, record its word codes and update ``wdict``."""
        tokens = _to_text(doc).lower().split()
        codes = [self.dic(token, True) for token in tokens]
        doc_id = len(self.docs)
        self.docs.append(codes)
        for token, code in zip(tokens, codes):
            # The original compared the integer code with the stop list, so
            # stop words given as strings were never filtered out.
            if self._is_stopword(token, code):
                continue
            self.wdict.setdefault(code, []).append(doc_id)

    def build(self):
        """Create ``keys`` and the raw count matrix ``A`` from ``wdict``."""
        self.keys = sorted(
            code for code, postings in self.wdict.items() if postings)
        self.A = zeros([len(self.keys), len(self.docs)])
        for row, code in enumerate(self.keys):
            for doc_id in self.wdict[code]:
                self.A[row, doc_id] += 1

    def calc(self):
        """Decompose ``A`` and store the factors as ``U``, ``S`` and ``Vt``."""
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        """Re-weight ``A`` in place.

        Each cell becomes ``(cell / column sum) * log(n_columns / n_positive)``
        where ``n_positive`` is the number of positive cells in the cell's row.
        Rows and columns whose total is zero are left untouched instead of
        being turned into NaN.
        """
        words_per_doc = self.A.sum(axis=0)
        docs_per_word = (self.A > 0).sum(axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            if docs_per_word[i] == 0:
                continue
            idf = log(float(cols) / docs_per_word[i])
            for j in range(cols):
                if words_per_doc[j] == 0:
                    continue
                self.A[i, j] = (self.A[i, j] / words_per_doc[j]) * idf

    def dump_src(self):
        """Print every row of ``A`` prefixed by the word it belongs to."""
        self.prepare()
        # This header string appears to have lost its original text.
        print(' ')
        for i, row in enumerate(self.A):
            # Row i belongs to code keys[i]; the original indexed the
            # dictionary with i directly, which is wrong once any word is
            # filtered out.
            print(u'{} {}'.format(self.dictionary[self.keys[i]], row))

    def print_svd(self):
        """Print ``S``, the first three columns of ``U`` and three rows of ``-Vt``."""
        self.prepare()
        print(' ')
        print(self.S)
        print(' 3 U ')
        for i, row in enumerate(self.U):
            print(u'{} {}'.format(self.dictionary[self.keys[i]], row[0:3]))
        print(' 3 Vt ')
        print(-1 * self.Vt[0:3, :])

    def find(self, word):
        """Rank all documents by their distance to *word* in a 2-D projection.

        The word's coordinates are columns 1 and 2 of ``-U`` for its row; each
        document's coordinates are rows 1 and 2 of ``-Vt`` for its column.

        Returns:
            A list of ``(doc_number, doc_codes, x, y, distance)`` tuples sorted
            by ascending distance, or ``[]`` when the word is not in the
            dictionary or has no row in the matrix.

        Raises:
            ValueError: when there are fewer than 3 matrix rows or 3 documents,
                so the two coordinates cannot be taken.
        """
        self.prepare()
        code = self.dic(word)
        # "is None" rather than "not code": code 0 is a valid word.
        if code is None:
            print(' ')
            return []
        if code not in self.keys:
            print(' stopwords')
            return []
        if self.U.shape[1] < 3 or self.Vt.shape[0] < 3:
            raise ValueError('find() needs at least 3 words and 3 documents')
        row = self.keys.index(code)
        shown = _to_text(word)
        print(u' '.join(
            [u'word --- ', shown, u'=', self.dictionary[code], u'.\n']))
        wx, wy = (-1 * self.U[:, 1:3])[row]
        print(u'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(row, wx, wy, shown))
        xx, yy = -1 * self.Vt[1:3, :]
        ranked = []
        for doc_id, codes in enumerate(self.docs):
            ax, ay = xx[doc_id], yy[doc_id]
            dx, dy = float(wx - ax), float(wy - ay)
            ranked.append((doc_id, codes, ax, ay, sqrt(dx * dx + dy * dy)))
        ranked.sort(key=lambda item: item[4])
        return ranked


if __name__ == '__main__':
    # Demo from the article.  It uses the module-level ``stemmer``, which has
    # to be defined before this runs.
    docs = [
        " WikiLeaks",
        " , ",
        " 19 ",
        " Wikileaks ",
        " ",
        " Wikileaks",
        " ",
        " WikiLeaks, , ",
        " ",
    ]
    ignorechars = ''',:'!'''
    word = ""
    lsa = LSI([], ignorechars, docs)
    lsa.build()
    lsa.dump_src()
    lsa.calc()
    lsa.print_svd()
    for res in lsa.find(word):
        print(u'{} {} {} {}'.format(
            res[0], res[4], res[1], _to_text(docs[res[0]])))

# Sample output of lsa.dump_src() as given in the article:
#   British [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
#   polits [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
#   knows [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
#   ...
lsa.print_svd()
Here are the first 3 columns of the U matrix:
British [-0.06333698 -0.08969849 0.03023127]
police [-0.14969793 -0.20853416 0.07106177]
knows [-0.06333698 -0.08969849 0.03023127]
...
Here are the first 3 rows of the Vt matrix:
[[0.25550481 0.47069418 0.27633104 0.39579252 0.21466192 0.26635401 0.32757769 0.3483847 0.3666749]
 [0.34469126 -0.18334417 -0.36995197 0.37444485 -0.29101203 0.27916372 -0.26791709 0.45665895 -0.35715836]
 [-0.10950444 0.64280654 -0.39672464 -0.1011325 -0.36012511 -0.01213328 0.38644373 -0.14789727 -0.32579232]]
for res in lsa.find(word): print res[0], res[4], res[1], docs[res[0]]
word 9 (word code in the dictionary) -0.17 (first coordinate of the word) 0.46 (second coordinate) United States (the word itself)
document number in list | distance | document decomposed into codes | the document itself
6 0.127328977215 [35, 36, 9, 37, 38, 39, 23, 40, 12, 41] NATO and the United States developed plans for the defense of the Baltic States against Russia
1 0.182108022464 [7, 8, 9, 9, 10, 11, 12, 13, 14, 15] The trial against the Russian who sent out spam begins in the US court
5 0.649492914495 [31, 8, 32, 33, 34, 5, 6] The Swedish court refused to consider the appeal of the founder of Wikileaks
0 0.765573367056 [0, 1, 2, 3, 4, 5, 6] British police are aware of the whereabouts of the founder of WikiLeaks
3 0.779637110377 [7, 24, 25, 5, 26, 6, 27, 28] Julian Assange, the founder of the Wikileaks website, is arrested in the UK
8 0.810477163078 [7, 45, 36, 46, 47, 48, 17, 18, 19] Nobel Prizes will be awarded today in Stockholm and Oslo
4 0.831319718049 [29, 30, 16, 17, 18, 19] Ukraine ignores the Nobel Prize award ceremony
7 0.870710388156 [1, 24, 42, 5, 6, 43, 44, 25] British police found the founder of WikiLeaks, but did not arrest
2 0.88243190531 [16, 17, 18, 19, 20, 21, 22, 23] 19 countries are boycotting the Nobel Peace Prize award ceremony
Source: https://habr.com/ru/post/197238/
All Articles