import re

# matches either a data-<tag>(<value>) marker or a plain word
tag_var_re = re.compile(r'data-([a-z-]+)\((.*?)\)|(\S+)')

def make_sample(rs, cls, *args, **kwargs):
    tokens = [cls] + list(args)
    for k, v in kwargs.items():
        tokens.append(k)
        tokens.append(v)
    result = rs.reply('', ' '.join(map(str, tokens))).strip()
    if result == '[ERR: No Reply Matched]':
        raise Exception("failed to generate string for {}".format(tokens))
    cmd, en, tags = [cls], [], []
    for tag, value, just_word in tag_var_re.findall(result):
        if just_word:
            # a plain word outside any data-...() marker
            en.append(just_word)
            tags.append('O')
        else:
            # 'remind-when' -> 'when', 'remind-action' -> 'action'
            _, tag = tag.split('-', maxsplit=1)
            words = value.split()
            en.append(words.pop(0))
            tags.append('B-' + tag)
            for word in words:
                en.append(word)
                tags.append('I-' + tag)
            cmd.append(tag + ':')
            cmd.append('"' + value + '"')
    return cmd, en, tags
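To see what the regular expression does, here is a throwaway snippet (not part of the original pipeline) that decomposes a generated reply into tagged values and plain words:

sample_reply = 'please, remind me data-remind-when(tomorrow) to data-remind-action(do maki-uchi)'
for tag, value, just_word in tag_var_re.findall(sample_reply):
    if just_word:
        print('word:', just_word)
    else:
        print('tag:', tag, 'value:', value)
# word: please,
# word: remind
# word: me
# tag: remind-when value: tomorrow
# word: to
# tag: remind-action value: do maki-uchi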
import os
import random

from rivescript import RiveScript

# this_dir and COUNT are defined elsewhere in the script
rs = RiveScript(utf8=True)
rs.load_directory(os.path.join(this_dir, 'human_train_1'))
rs.sort_replies()

for c in ('yes', 'no', 'ping'):
    for _ in range(COUNT):
        add_sample(make_sample(rs, c))

to_remind = ['wash hands', 'read books', 'make tea', 'pay bills',
             'eat food', 'buy stuff', 'take a walk', 'do maki-uchi',
             'say hello', 'say yes', 'say no', 'play games']

for _ in range(COUNT):
    r = random.choice(to_remind)
    add_sample(make_sample(rs, 'remind', r))
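add_sample itself is not shown in this excerpt. A minimal sketch of what it might do, assuming the generated triples are accumulated and then written out to the three parallel files that the training script reads later (train.en, train.pa, train.tg); the names samples and save_samples are mine:

samples = []

def add_sample(sample):
    samples.append(sample)

def save_samples(data_dir):
    # one line per sample in each parallel file: the English sentence,
    # the parsed command, and the IOB tag sequence
    with open(os.path.join(data_dir, 'train.en'), 'w') as fen, \
         open(os.path.join(data_dir, 'train.pa'), 'w') as fpa, \
         open(os.path.join(data_dir, 'train.tg'), 'w') as ftg:
        for cmd, en, tags in samples:
            fen.write(' '.join(en) + '\n')
            fpa.write(' '.join(cmd) + '\n')
            ftg.write(' '.join(tags) + '\n')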
+ hello
- hello
- hey
- hi

+ ping
- {@hello}{random}|, sweetie{/random}
- {@hello} there
- {random}are |{/random}you {random}here|there{/random}?
- ping
- yo

+ yes
- yes
- yep
- yeah

+ no
- no
- not yet
- nope
+ remind *
@ maybe-please remind-without-please data-remind-action(<star>)

+ remind-without-please *
- remind me to <star>
- remind me data-remind-when({@when}) to <star>

+ when
- today
- later
- tomorrow

+ maybe-please *
- <@> {weight=3}
- please, <@>
- <@>, please
remind do maki-uchi
please, remind me data-remind-when(tomorrow) to data-remind-action(do maki-uchi)
please, remind me tomorrow to do maki-uchi
remind when: "tomorrow" action: "do maki-uchi"
O O O B-when O B-action I-action
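Putting it all together (a hypothetical call; the exact wording varies from run to run, since maybe-please and when pick random alternatives):

cmd, en, tags = make_sample(rs, 'remind', 'do maki-uchi')
print(' '.join(en))    # please, remind me tomorrow to do maki-uchi
print(' '.join(tags))  # O O O B-when O B-action I-action
print(' '.join(cmd))   # remind when: "tomorrow" action: "do maki-uchi"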
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelBinarizer

# HASH_SIZE (the size of the hashing space) is defined elsewhere in the script

def _embed(sentence):
    return one_hot(sentence, HASH_SIZE)

def _make_classifier(input_length, vocab_size, class_count):
    result = Sequential()
    result.add(Embedding(vocab_size, 8, input_length=input_length))
    result.add(Flatten())
    result.add(Dense(class_count, activation='sigmoid'))
    result.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return result

def _train(model, prep_func, train, validation=None, epochs=10, verbose=2):
    X, y = prep_func(*train)
    validation_data = None if validation is None else prep_func(*validation)
    model.fit(X, y, epochs=epochs, verbose=verbose, shuffle=False,
              validation_data=validation_data)

class Translator:
    def __init__(self, class_count=None, cls=None, lb=None):
        if class_count is None and lb is None and cls is None:
            raise Exception("Class count is not known")
        self.max_length = 32
        self.lb = lb or LabelBinarizer()
        if class_count is None and lb is not None:
            class_count = len(lb.classes_)
        self.classifier = cls or _make_classifier(self.max_length, HASH_SIZE,
                                                  class_count)

    def _prepare_classifier_data(self, lines, labels):
        X = pad_sequences([_embed(line) for line in lines],
                          padding='post', maxlen=self.max_length)
        y = self.lb.transform(labels)
        return X, y

    def train_classifier(self, lines, labels, validation=None):
        _train(self.classifier, self._prepare_classifier_data,
               (lines, labels), validation)

    def classifier_eval(self, lines, labels):
        X, y = self._prepare_classifier_data(lines, labels)
        loss, accuracy = self.classifier.evaluate(X, y)
        print(loss, accuracy * 100)

    def _classifier_predict(self, line):
        # assumed helper (not shown in the excerpt): pad a single line
        # the same way the training data is padded
        X = pad_sequences([_embed(line)], padding='post',
                          maxlen=self.max_length)
        return self.classifier.predict(X)

    def classify(self, line):
        res = self._classifier_predict(line)
        if max(res[0]) > 0.1:
            return self.lb.inverse_transform(res)[0]
        else:
            return 'unknown'

    def classify2(self, line):
        res = self._classifier_predict(line)
        print('\n'.join(map(str, zip(self.lb.classes_, res[0]))))
        m = max(res[0])
        c = self.lb.inverse_transform(res)[0]
        if m > 0.05:
            return c
        elif m > 0.02:
            return 'probably ' + c
        else:
            return 'unknown ' + c + '? ' + str(m)
def load_sentences(file_name):
    with open(file_name) as fen:
        return [line.strip() for line in fen]

def load_labels(file_name):
    with open(file_name) as fpa:
        # the class label is the first token on each line
        return [line.strip().split(maxsplit=1)[0] for line in fpa]
sentences = load_sentences(os.path.join(data_dir, "train.en"))
labels = load_labels(os.path.join(data_dir, "train.pa"))
tags = load_sentences(os.path.join(data_dir, "train.tg"))

label_count = len(set(labels))
translator = Translator(label_count)
translator.lb.fit(labels)
translator.train_classifier(sentences, labels)
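The saving step is not shown in this excerpt. A minimal sketch, assuming the classifier is serialized with to_json() and the LabelBinarizer with pickle (the file names match the loading code below; note that to_json() stores only the architecture, so a complete pipeline would also need save_weights()/load_weights()):

import pickle

with open(os.path.join(data_dir, "trained.cls"), 'w') as model_file:
    model_file.write(translator.classifier.to_json())
with open(os.path.join(data_dir, "trained.lb"), 'wb') as labels_file:
    pickle.dump(translator.lb, labels_file)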
import os
import pickle
import sys

from keras.models import model_from_json

# model_from_json() expects the JSON string itself, not a path
with open(os.path.join(data_dir, "trained.cls")) as model_file:
    classifier = model_from_json(model_file.read())
with open(os.path.join(data_dir, "trained.lb"), 'rb') as labels_file:
    lb = pickle.load(labels_file)

translator = Translator(lb=lb, cls=classifier)
line = ' '.join(sys.argv[1:])
print(translator.classify2(line))
one_hot — at first glance, nothing here hints at trouble. The Keras documentation says:

One-hot encodes a text into a list of word indexes of size n.

This is a wrapper around the hashing_trick function.
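In the Keras source of that time, one_hot simply dispatches to hashing_trick with Python's built-in hash:

from keras.preprocessing.text import hashing_trick

# one_hot(text, n) is equivalent to:
hashing_trick(text, n, hash_function=hash)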
hashing_trick — again, it seems innocent enough. But look further down at its list of arguments:

Converts a text to a sequence of indexes in a fixed-size hashing space.

hash_function: defaults to python hash function, can be 'md5' or any function. Note that hash is not a stable hashing function, so it is not consistent across different runs.
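And that is the catch: in Python 3, hash for strings is randomized per process (controlled by PYTHONHASHSEED), so the training script and the prediction script map the same word to different indexes. It is easy to check from the shell (the two numbers will almost certainly differ):

$ python3 -c "print(hash('tomorrow'))"
$ python3 -c "print(hash('tomorrow'))"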
So I replaced one_hot with hashing_trick using md5, but the result did not change: I was still getting the same 25% of correct answers. Using one_hot was certainly a mistake, but not the only one.
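The patched embedding is not shown in the article; my reconstruction of that change would be:

from keras.preprocessing.text import hashing_trick

def _embed(sentence):
    # md5 is a stable hash, so indexes stay consistent across processes
    return hashing_trick(sentence, HASH_SIZE, hash_function='md5')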
$ ./translate4.py 'please, remind me to make some tea'
probably remind
Source: https://habr.com/ru/post/348224/