This tutorial uses two Python libraries: pandas and requests. The former will be used to organize the data, and the latter to make requests to the API that returns the data. We will save the response in the `info`
# Put the API response into the `info` variable and then into a pandas DataFrame.
import pandas as pd
import requests

# Fetch up to 1000 articles in the "Ravenclaws" category from the wiki API.
category = 'Ravenclaws'
base = 'http://harrypotter.wikia.com/api/v1/Articles/List?expand=1&limit=1000&category='
response = requests.get(base + category)
info = response.json()['items']

ravenclaw_df = pd.DataFrame(info)
print('Number of articles: {}'.format(len(info)))
print('')
ravenclaw_df.head()
The `ravenclaw_df` table above contains only article metadata and short descriptions, not the article text itself.
# The list endpoint returns only descriptions. To get the content, you have to
# use another kind of API request and request data based on the article ID.
# First, collect the article lists for every house.
houses = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
mydf = pd.DataFrame()

# For each house: fetch its category, keep only real articles, drop the
# columns we do not need, and tag every row with its house.
for house in houses:
    url = "http://harrypotter.wikia.com/api/v1/Articles/List?expand=1&limit=1000&category=" + house + 's'
    response = requests.get(url)
    items = response.json()['items']

    house_df = pd.DataFrame(items)
    house_df = house_df[house_df['type'] == 'article']
    house_df.reset_index(drop=True, inplace=True)
    house_df.drop(['abstract', 'comments', 'ns', 'original_dimensions',
                   'revision', 'thumbnail', 'type'], axis=1, inplace=True)
    house_df['house'] = pd.Series([house] * len(house_df))
    mydf = pd.concat([mydf, house_df])
mydf.reset_index(drop=True, inplace=True)

print('Number of student articles: {}'.format(len(mydf)))
print('')
print(mydf.head())
print('')
print(mydf.tail())
# " " # - , # text_dict = {} for iden in mydf['id']: url = 'http://harrypotter.wikia.com/api/v1/Articles/AsSimpleJson?id=' + str(iden) requested_url = requests.get(url) json_results = requested_url.json() sections = json_results['sections'] contents = [sections[i]['content'] for i, x in enumerate(sections) if sections[i]['title'] == 'Personality and traits'] if contents: paragraphs = contents[0] texts = [paragraphs[i]['text'] for i, x in enumerate(paragraphs)] all_text = ' '.join(texts) else: all_text = '' text_dict[iden] = all_text # DataFrame " " text_df = pd.DataFrame.from_dict(text_dict, orient='index') text_df.reset_index(inplace=True) text_df.columns = ['id', 'text'] text_df['text_len'] = text_df['text'].map(lambda x: len(x)) # mydf_all = pd.merge(mydf, text_df, on='id') mydf_all.sort_values('text_len', ascending=False, inplace=True) # DataFrame , " " mydf_relevant = mydf_all[mydf_all['text_len'] > 0] print('Number of useable articles: {}'.format(len(mydf_relevant))) print('') mydf_relevant.head()
# Seed trait words for each house, drawn from the Sorting Hat's descriptions.
trait_dict = {
    'Gryffindor': ['bravery', 'nerve', 'chivalry', 'daring', 'courage'],
    'Slytherin': ['resourcefulness', 'cunning', 'ambition', 'determination',
                  'self-preservation', 'fraternity', 'cleverness'],
    'Ravenclaw': ['intelligence', 'wit', 'wisdom', 'creativity', 'originality',
                  'individuality', 'acceptance'],
    'Hufflepuff': ['dedication', 'diligence', 'fairness', 'patience', 'kindness',
                   'tolerance', 'persistence', 'loyalty'],
}
When he was younger, Neville was clumsy, forgetful, and shy, and many thought he was ill-suited to Gryffindor house because he seemed timid.
Thanks to the support of friends to whom he was very devoted; the inspiration of Professor Remus Lupin to face his fears in his third year of study; and the fact that the torturers of his parents were walking free, Neville became braver, more self-confident, and selfless in the fight against Lord Voldemort and his Death Eaters.
(In the original English: when he was younger, he was clumsy, forgetful, and shy, but he became brave, self-assured, and dedicated to the fight against Lord Voldemort and his Death Eaters.)
We can look up synonyms using the `synsets` function from WordNet, the lexical database of the English language included in the nltk module (NLTK — Natural Language Toolkit). A “synset” is a “synonym set”: a collection of synonyms, or “lemmas.” The `synsets`
# The synsets function returns the sets of synonyms associated with a word.
from nltk.corpus import wordnet as wn

# Look up the synonym sets for a few sample trait words.
bravery_sets = wn.synsets('bravery')
print("Synonym sets associated with the word 'bravery': {}".format(bravery_sets))

fairness_sets = wn.synsets('fairness')
print('')
print("Synonym sets associated with the word 'fairness': {}".format(fairness_sets))

wit_sets = wn.synsets('wit')
print('')
print("Synonym sets associated with the word 'wit': {}".format(wit_sets))

cunning_sets = wn.synsets('cunning')
print('')
print("Synonym sets associated with the word 'cunning': {}".format(cunning_sets))

# 'cunning' is both a noun and an adjective; restrict the lookup to nouns.
cunning_sets = wn.synsets('cunning', pos=wn.NOUN)
print('')
print("Synonym sets associated with the *noun* 'cunning': {}".format(cunning_sets))
print('')

# For every synset found above, show its name and its member lemmas.
for result in [bravery_sets, fairness_sets, wit_sets, cunning_sets]:
    for synset in result:
        print((synset.name(), synset.lemma_names()))
`wn.synsets('bravery')` is associated with two sets of synonyms: one for `courage.n.01` and one for `fearlessness.n.01`
. Let's see what this means. For “cunning”, the results also include `crafty.s.01` and `clever.s.03` (adjectives). They appeared here because the word “cunning” can be both a noun and an adjective. To keep only nouns, you can specify `wn.synsets('cunning', pos=wn.NOUN)`
. The `synsets` function can also return unwanted synonym sets. For example, `paleness.n.02` (“have light skin by nature”) and `comeliness.n.01`
# ("look good and be attractive") are also associated with the word "fairness".
# These traits are clearly not associated with Hufflepuff (although Neville
# Longbottom grew up handsome), so such synsets must be excluded by hand.
# Inspect the lemmas, antonyms, and derived word forms for "bravery".
foo1 = wn.synsets('bravery')
for synset in foo1:
    for lemma in synset.lemmas():
        print("Synset: {}; Lemma: {}; Antonyms: {}; Word Forms: {}".format(
            synset.name(), lemma.name(), lemma.antonyms(),
            lemma.derivationally_related_forms()))
        print("")
# Manually curated synsets that genuinely match each house's traits.
relevant_synsets = {}
relevant_synsets['Ravenclaw'] = [wn.synset('intelligence.n.01'), wn.synset('wit.n.01'),
                                 wn.synset('brain.n.02'), wn.synset('wisdom.n.01'),
                                 wn.synset('wisdom.n.02'), wn.synset('wisdom.n.03'),
                                 wn.synset('wisdom.n.04'), wn.synset('creativity.n.01'),
                                 wn.synset('originality.n.01'), wn.synset('originality.n.02'),
                                 wn.synset('individuality.n.01'), wn.synset('credence.n.01'),
                                 wn.synset('acceptance.n.03')]
# NOTE: the original listed 'fairness.n.01' twice; the duplicate is removed
# (the membership test below is unaffected).
relevant_synsets['Hufflepuff'] = [wn.synset('dedication.n.01'), wn.synset('commitment.n.04'),
                                  wn.synset('commitment.n.02'), wn.synset('diligence.n.01'),
                                  wn.synset('diligence.n.02'), wn.synset('application.n.06'),
                                  wn.synset('fairness.n.01'), wn.synset('patience.n.01'),
                                  wn.synset('kindness.n.01'), wn.synset('forgivingness.n.01'),
                                  wn.synset('kindness.n.03'), wn.synset('tolerance.n.03'),
                                  wn.synset('tolerance.n.04'), wn.synset('doggedness.n.01'),
                                  wn.synset('loyalty.n.01'), wn.synset('loyalty.n.02')]
relevant_synsets['Gryffindor'] = [wn.synset('courage.n.01'), wn.synset('fearlessness.n.01'),
                                  wn.synset('heart.n.03'), wn.synset('boldness.n.02'),
                                  wn.synset('chivalry.n.01'), wn.synset('boldness.n.01')]
relevant_synsets['Slytherin'] = [wn.synset('resourcefulness.n.01'), wn.synset('resource.n.03'),
                                 wn.synset('craft.n.05'), wn.synset('cunning.n.02'),
                                 wn.synset('ambition.n.01'), wn.synset('ambition.n.02'),
                                 wn.synset('determination.n.02'), wn.synset('determination.n.04'),
                                 wn.synset('self-preservation.n.01'), wn.synset('brotherhood.n.02'),
                                 wn.synset('inventiveness.n.01'), wn.synset('brightness.n.02'),
                                 wn.synset('ingenuity.n.02')]


def get_forms(lemma):
    """Return lower-cased word forms derived from a WordNet lemma.

    Keeps derivationally related forms with noun/adjective POS tags
    ('n', 's', 'a'); for adjectives it also generates heuristic
    comparative, superlative, "-ness", and "-ly"/"-ally" variants.
    Returns an empty list when the lemma has no related forms.
    """
    output_list = []
    for drf in lemma.derivationally_related_forms():
        drf_pos = str(drf).split(".")[1]
        if drf_pos not in ['n', 's', 'a']:
            continue
        raw = drf.name()       # hoisted: the original called drf.name() repeatedly
        low = raw.lower()
        output_list.append(low)
        if drf_pos in ['s', 'a']:
            if len(raw) == 3:
                # Three-letter adjective, e.g. "sad": double the final
                # consonant for comparative/superlative ("sadder"/"saddest").
                last_letter = raw[-1:]
                output_list.extend([low + last_letter + 'er',
                                    low + last_letter + 'est',
                                    low + 'ness', low + 'ly'])
            elif raw[-4:] in ['able', 'ible']:
                output_list.extend([low + 'r', low + 'st',
                                    low + 'ness', low[:-1] + 'y'])
            elif raw[-1:] == 'e':
                output_list.extend([low + 'r', low + 'st',
                                    low + 'ness', low + 'ly'])
            elif raw[-2:] == 'ic':
                output_list.extend([low + 'er', low + 'est',
                                    low + 'ness', low + 'ally'])
            elif raw[-1:] == 'y':
                output_list.extend([low[:-1] + 'ier', low[:-1] + 'iest',
                                    low[:-1] + 'iness', low[:-1] + 'ily'])
            else:
                output_list.extend([low + 'er', low + 'est',
                                    low + 'ness', low + 'ly'])
    # The original wrapped the loop in `if drfs: ... else: return output_list`;
    # both branches returned output_list, so the guard is dropped.
    return output_list


import copy

# Expand each house's seed traits with lemma names, derived word forms,
# and collect their antonyms ("anti-traits") per house.
new_trait_dict = copy.deepcopy(trait_dict)
antonym_dict = {}

for house, traits in trait_dict.items():
    antonym_dict[house] = []
    for trait in traits:
        for synset in wn.synsets(trait, pos=wn.NOUN):
            if synset not in relevant_synsets[house]:
                continue
            for lemma in synset.lemmas():
                new_trait_dict[house].append(lemma.name().lower())
                # Call get_forms once per lemma (the original called it twice:
                # once to test, once to extend). Extending by [] is a no-op.
                new_trait_dict[house].extend(get_forms(lemma))
                for ant in lemma.antonyms():
                    antonym_dict[house].append(ant.name().lower())
                    antonym_dict[house].extend(get_forms(ant))
    new_trait_dict[house] = sorted(set(new_trait_dict[house]))
    antonym_dict[house] = sorted(set(antonym_dict[house]))

print("Gryffindor traits: {}".format(new_trait_dict['Gryffindor']))
print("")
print("Gryffindor anti-traits: {}".format(antonym_dict['Gryffindor']))
print("")
# Verify that no word appears in more than one house's dictionary.
from itertools import combinations


def test_overlap(word_dict):
    """Return one bool per pair of houses: True when that pair shares no words.

    The parameter was renamed from `dict`, which shadowed the builtin.
    """
    results = []
    house_combos = combinations(list(word_dict.keys()), 2)
    for combo in house_combos:
        results.append(set(word_dict[combo[0]]).isdisjoint(word_dict[combo[1]]))
    return results


# With 4 houses there are 6 pairs; a sum below 6 means some pair overlaps.
print("Any words overlap in trait dictionary? {}".format(sum(test_overlap(new_trait_dict)) != 6))
print("Any words overlap in antonym dictionary? {}".format(sum(test_overlap(antonym_dict)) != 6))
# "word_tokenize", from nltk import word_tokenize # , def sort_student(text): text_list = word_tokenize(text) text_list = [word.lower() for word in text_list] score_dict = {} houses = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'] for house in houses: score_dict[house] = (sum([True for word in text_list if word in new_trait_dict[house]]) - sum([True for word in text_list if word in antonym_dict[house]])) sorted_house = max(score_dict, key=score_dict.get) sorted_house_score = score_dict[sorted_house] if sum([True for i in score_dict.values() if i==sorted_house_score]) == 1: return sorted_house else: return "Tie!" # print(sort_student('Alice was brave')) print(sort_student('Alice was British'))
# Sort every student article and store the predicted house in a new column.
# Work on an explicit copy so pandas' SettingWithCopyWarning does not fire;
# this replaces the original's global
# `pd.options.mode.chained_assignment = None`, which silenced the check
# for the whole session.
mydf_relevant = mydf_relevant.copy()
mydf_relevant['new_house'] = mydf_relevant['text'].map(sort_student)
mydf_relevant.head(20)
print("Match rate: {}".format(sum(mydf_relevant['house'] == mydf_relevant['new_house']) / len(mydf_relevant))) print("Percentage of ties: {}".format(sum(mydf_relevant['new_house'] == 'Tie!') / len(mydf_relevant)))
# Examine the longest article (row 0 after the descending text_len sort —
# presumably Tom Riddle's): tokenize it and list which trait and
# anti-trait words it actually contains, per house.
tom_riddle = word_tokenize(mydf_relevant['text'].values[0])
tom_riddle = [token.lower() for token in tom_riddle]

words_dict = {}
anti_dict = {}
houses = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
for house in houses:
    words_dict[house] = [word for word in tom_riddle if word in new_trait_dict[house]]
    anti_dict[house] = [word for word in tom_riddle if word in antonym_dict[house]]

print(words_dict)
print("")
print(anti_dict)
Source: https://habr.com/ru/post/331352/
All Articles