# Sample corpus: nine short news headlines, one string per document.
# NOTE(review): most of the original (non-ASCII) headline text appears to have
# been lost in extraction; the literals are kept exactly as found.
titles = [
    " WikiLeaks",
    " , ",
    " 19 ",
    " Wikileaks ",
    " ",
    " Wikileaks",
    " ",
    " WikiLeaks, , ",
    " ",
]
The code relies on numpy and scipy, and uses nltk to bring words to their original (base) form. Installation:
pip install numpy
pip install nltk
pip install scipy
If installing scipy runs into any problems (it requires a BLAS library to be installed), this will probably help:
apt-get install gfortran libopenblas-dev liblapack-dev
class LSI(object):
    """Latent semantic indexing over a small list of documents (walk-through excerpt).

    This excerpt shows only the core methods. ``__init__`` calls
    ``self.add_doc`` and ``find`` calls ``self.prepare``; both are part of the
    complete ``LSI`` listing that appears later in this file.

    Relies on module-level names that are not defined in this class:
    ``stemmer`` (an object with a ``stem(word)`` method), ``zeros``, ``svd``,
    ``sum``, ``asarray``, ``log`` and ``sqrt``.

    Attributes:
        wdict: maps a word code to the list of document numbers containing it.
        dictionary: list of stemmed words; a word's code is its index here.
        docs: list of documents, each stored as a list of word codes.
        stopwords: the stopword collection passed to the constructor.
        ignorechars: text string of characters removed from every word.
    """

    def __init__(self, stopwords, ignorechars, docs):
        """Store the settings and index every document in ``docs``."""
        self.wdict = {}
        self.dictionary = []
        # Fix: ``docs`` was never initialised although build()/find() read it
        # and add_doc() appends to it.
        self.docs = []
        self.stopwords = stopwords
        self.ignorechars = self._to_text(ignorechars)
        for doc in docs:
            self.add_doc(doc)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def _clean(self, word):
        """Lowercase ``word`` and drop every character listed in ``ignorechars``.

        Works on text rather than on UTF-8 bytes, so non-ASCII letters are
        lowercased and multi-byte characters are never cut in half.
        """
        text = self._to_text(word).lower()
        return u''.join(ch for ch in text if ch not in self.ignorechars)

    def dic(self, word, add=False):
        """Return the dictionary code of ``word`` after cleaning and stemming.

        Args:
            word: the word to look up (text or UTF-8 bytes).
            add: when true, an unknown word is appended to the dictionary.

        Returns:
            The integer code, or ``None`` when the word is unknown and
            ``add`` is false.
        """
        stem = stemmer.stem(self._clean(word))
        try:
            return self.dictionary.index(stem)
        except ValueError:
            pass
        if not add:
            return None
        self.dictionary.append(stem)
        return len(self.dictionary) - 1

    def build(self):
        """Build the word-by-document count matrix ``self.A``.

        Rows follow ``self.keys`` (sorted codes of the words that occur in at
        least one document); columns follow ``self.docs``.
        """
        self.keys = sorted(code for code, where in self.wdict.items() if where)
        self.A = zeros([len(self.keys), len(self.docs)])
        for row, code in enumerate(self.keys):
            for doc_no in self.wdict[code]:
                self.A[row, doc_no] += 1

    def calc(self):
        """Decompose ``self.A`` with SVD into ``self.U``, ``self.S``, ``self.Vt``."""
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        """Re-weight ``self.A`` in place: (count / words in doc) * log(docs / docs with word)."""
        words_per_doc = sum(self.A, axis=0)
        docs_per_word = sum(asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            # The IDF factor depends only on the row, so compute it once.
            idf = log(float(cols) / docs_per_word[i])
            self.A[i, :] = self.A[i, :] / words_per_doc * idf

    def find(self, word):
        """Rank all documents by their distance to ``word`` in the reduced space.

        The word is placed at columns 1 and 2 of ``-U`` and each document at
        rows 1 and 2 of ``-Vt``; the distance is Euclidean in that plane.

        Returns:
            A list of ``(doc number, doc codes, x, y, distance)`` tuples
            sorted by distance, or ``[]`` when the word is unknown or was
            never indexed.
        """
        self.prepare()
        idx = self.dic(word)
        # Fix: ``not idx`` also rejected the valid dictionary code 0.
        if idx is None:
            print(' ')
            return []
        # In the dictionary but not in the matrix (e.g. filtered as a stopword).
        if idx not in self.keys:
            print(' stopwords')
            return []
        row = self.keys.index(idx)
        shown = self._to_text(word)
        print(u' '.join([u'word --- ', shown, u'=', self.dictionary[idx], u'.\n']))
        wx, wy = (-1 * self.U[:, 1:3])[row]
        print(u'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(row, wx, wy, shown))
        xx, yy = -1 * self.Vt[1:3, :]
        arts = []
        for k, v in enumerate(self.docs):
            ax, ay = xx[k], yy[k]
            dx, dy = float(wx - ax), float(wy - ay)
            arts.append((k, v, ax, ay, sqrt(dx * dx + dy * dy)))
        return sorted(arts, key=lambda a: a[4])
class LSI(object):
    """Latent semantic indexing over a small list of documents.

    Documents are split into words, each word is cleaned and stemmed and
    replaced by its integer code. ``build`` turns the result into a
    word-by-document count matrix, ``calc`` decomposes it with SVD, and
    ``find`` ranks documents by their distance to a query word.

    Relies on module-level names that are not defined in this class:
    ``stemmer`` (an object with a ``stem(word)`` method), ``zeros``, ``svd``,
    ``sum``, ``asarray``, ``log`` and ``sqrt``.

    Attributes:
        wdict: maps a word code to the list of document numbers containing it.
        dictionary: list of stemmed words; a word's code is its index here.
        docs: list of documents, each stored as a list of word codes.
        stopwords: the stopword collection passed to the constructor.
        ignorechars: text string of characters removed from every word.
    """

    def __init__(self, stopwords, ignorechars, docs):
        """Store the settings and index every document in ``docs``."""
        self.wdict = {}
        self.dictionary = []
        # Fix: ``docs`` was never initialised, so the first add_doc() call
        # failed with AttributeError.
        self.docs = []
        self.stopwords = stopwords
        self.ignorechars = self._to_text(ignorechars)
        for doc in docs:
            self.add_doc(doc)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def _clean(self, word):
        """Lowercase ``word`` and drop every character listed in ``ignorechars``.

        Works on text rather than on UTF-8 bytes, so non-ASCII letters are
        lowercased and multi-byte characters are never cut in half.
        """
        text = self._to_text(word).lower()
        return u''.join(ch for ch in text if ch not in self.ignorechars)

    def prepare(self):
        """Rebuild the count matrix and recompute its SVD.

        NOTE: ``build`` recreates ``self.A``, so weights applied earlier with
        ``TFIDF`` are discarded by every call to this method.
        """
        self.build()
        self.calc()

    def dic(self, word, add=False):
        """Return the dictionary code of ``word`` after cleaning and stemming.

        Args:
            word: the word to look up (text or UTF-8 bytes).
            add: when true, an unknown word is appended to the dictionary.

        Returns:
            The integer code, or ``None`` when the word is unknown and
            ``add`` is false.
        """
        stem = stemmer.stem(self._clean(word))
        try:
            return self.dictionary.index(stem)
        except ValueError:
            pass
        if not add:
            return None
        self.dictionary.append(stem)
        return len(self.dictionary) - 1

    def add_doc(self, doc):
        """Index one document: record its word codes and update ``wdict``.

        Every word is added to ``dictionary``; stopwords are left out of
        ``wdict`` and therefore out of the matrix built later.
        """
        doc_no = len(self.docs)
        tokens = self._to_text(doc).lower().split()
        codes = [self.dic(token, True) for token in tokens]
        self.docs.append(codes)
        stop = set(self._to_text(item) for item in self.stopwords)
        for token, code in zip(tokens, codes):
            # Fix: the original compared only the integer code with the
            # stopword list, so stopwords given as words never matched.
            # The code itself is still checked for backward compatibility.
            if (code in stop
                    or self.dictionary[code] in stop
                    or self._clean(token) in stop):
                continue
            self.wdict.setdefault(code, []).append(doc_no)

    def build(self):
        """Build the word-by-document count matrix ``self.A``.

        Rows follow ``self.keys`` (sorted codes of the words that occur in at
        least one document); columns follow ``self.docs``.
        """
        self.keys = sorted(code for code, where in self.wdict.items() if where)
        self.A = zeros([len(self.keys), len(self.docs)])
        for row, code in enumerate(self.keys):
            for doc_no in self.wdict[code]:
                self.A[row, doc_no] += 1

    def calc(self):
        """Decompose ``self.A`` with SVD into ``self.U``, ``self.S``, ``self.Vt``."""
        self.U, self.S, self.Vt = svd(self.A)

    def TFIDF(self):
        """Re-weight ``self.A`` in place: (count / words in doc) * log(docs / docs with word)."""
        words_per_doc = sum(self.A, axis=0)
        docs_per_word = sum(asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            # The IDF factor depends only on the row, so compute it once.
            idf = log(float(cols) / docs_per_word[i])
            self.A[i, :] = self.A[i, :] / words_per_doc * idf

    def dump_src(self):
        """Print the count matrix, one labelled row per word."""
        self.prepare()
        print(' ')
        # Fix: row i belongs to word code keys[i], not to code i.
        for code, row in zip(self.keys, self.A):
            print(u'{} {}'.format(self.dictionary[code], row))

    def print_svd(self):
        """Print the singular values, the first 3 columns of U and the first 3 rows of -Vt."""
        self.prepare()
        print(' ')
        print(self.S)
        print(' 3 U ')
        for code, row in zip(self.keys, self.U):
            print(u'{} {}'.format(self.dictionary[code], row[0:3]))
        print(' 3 Vt ')
        print(-1 * self.Vt[0:3, :])

    def find(self, word):
        """Rank all documents by their distance to ``word`` in the reduced space.

        The word is placed at columns 1 and 2 of ``-U`` and each document at
        rows 1 and 2 of ``-Vt``; the distance is Euclidean in that plane.

        Returns:
            A list of ``(doc number, doc codes, x, y, distance)`` tuples
            sorted by distance, or ``[]`` when the word is unknown or was
            never indexed.
        """
        self.prepare()
        idx = self.dic(word)
        # Fix: ``not idx`` also rejected the valid dictionary code 0.
        if idx is None:
            print(' ')
            return []
        # In the dictionary but not in the matrix (e.g. filtered as a stopword).
        if idx not in self.keys:
            print(' stopwords')
            return []
        row = self.keys.index(idx)
        shown = self._to_text(word)
        print(u' '.join([u'word --- ', shown, u'=', self.dictionary[idx], u'.\n']))
        wx, wy = (-1 * self.U[:, 1:3])[row]
        print(u'word {}\t{:0.2f}\t{:0.2f}\t{}\n'.format(row, wx, wy, shown))
        xx, yy = -1 * self.Vt[1:3, :]
        arts = []
        for k, v in enumerate(self.docs):
            ax, ay = xx[k], yy[k]
            dx, dy = float(wx - ax), float(wy - ay)
            arts.append((k, v, ax, ay, sqrt(dx * dx + dy * dy)))
        return sorted(arts, key=lambda a: a[4])
docs =[ " WikiLeaks", " , ", " 19 ", " Wikileaks ", " ", " Wikileaks", " ", " WikiLeaks, , ", " " ] ignorechars = ''',:'!''' word = "" lsa = LSI([], ignorechars, docs) lsa.build() lsa.dump_src() lsa.calc() lsa.print_svd() for res in lsa.find(word): print res[0], res[4], res[1], docs[res[0]]
lsa.dump_src() British [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] police [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.] knows [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] ...
lsa.print_svd () here are the first 3 columns of the u matrix British [-0.06333698 -0.08969849 0.03023127] police [-0.14969793 -0.20853416 0.07106177] knows [-0.06333698 -0.08969849 0.03023127] ... Here are the first 3 lines of the Vt matrix [[0.25550481 0.47069418 0.27633104 0.39579252 0.21466192 0.26635401 0.32757769 0.3483847 0.3666749] [0.34469126 -0.18334417 -0.36995197 0.37444485 -0.29101203 0.27916372 -0.26791709 0.45665895 -0.35715836] [-0.10950444 0.64280654 -0.39672464 -0.1011325 -0.36012511 -0.01213328 0.38644373 -0.14789727 -0.32579232]]
for res in lsa.find (word): print res [0], res [4], res [1], docs [res [0]] word 9 (word code in the dictionary) -0.17 (first coordinate of the word) 0.46 (second coordinate) United States (the word itself) document number in list | distance | document decomposed on codes | the document itself 6 0.127328977215 [35, 36, 9, 37, 38, 39, 23, 40, 12, 41] NATO and the United States developed plans for the defense of the Baltic States against Russia 1 0.182108022464 [7, 8, 9, 9, 10, 11, 12, 13, 14, 15] The trial against the Russian who sent out spam begins in the US court 5 0.649492914495 [31, 8, 32, 33, 34, 5, 6] The Swedish court refused to consider the appeal of the founder of Wikileaks 0 0.765573367056 [0, 1, 2, 3, 4, 5, 6] British police are aware of the whereabouts of the founder of WikiLeaks 3 0.779637110377 [7, 24, 25, 5, 26, 6, 27, 28] Julian Assandzh, the founder of the Wikileaks website, is arrested in the UK 8 0.810477163078 [7, 45, 36, 46, 47, 48, 17, 18, 19] Nobel Prizes will be awarded today in Stockholm and Oslo 4 0.831319718049 [29, 30, 16, 17, 18, 19] Ukraine ignores the Nobel Prize award ceremony 7 0.870710388156 [1, 24, 42, 5, 6, 43, 44, 25] British police found the founder of WikiLeaks, but did not arrest 2 0.88243190531 [16, 17, 18, 19, 20, 21, 22, 23] 19 countries are boycotting the Nobel Peace Prize award ceremony
Source: https://habr.com/ru/post/197238/
All Articles