import requests
from bs4 import BeautifulSoup
import wget
import json
import sys
import time

author_name = "urbancow"
author_id = 300612
page_count = 93
path_to_save = "/home/forcesh/Downloads/Projects/iStock/urbancow/"

# pages are numbered from 1, so add 1 to include the last page as well
for page in range(1, page_count + 1):
    search_url = 'http://www.istockphoto.com/ru/portfolio/' + author_name
    data = {"facets": {"pageNumber": page,
                       "perPage": 100,
                       "abstractType": ["photos", "illustrations"],
                       "order": "bestMatch",
                       "portfolioID": [author_id],
                       "additionalAudio": "false"}}
    r = requests.post(search_url, json.dumps(data))
    soup = BeautifulSoup(r.text, "lxml")
    scope = soup.select("div.results-container")
    search_list = scope[0].select("section.search-list")
    figure_list = search_list[0].select("figure")
    for figure in figure_list:
        href = figure.select("a")[0].get("href")
        # the asset id sits between "gm" and the first "-" in the link
        gm_id = href.rsplit("gm", 1)[1].split('-')[0]
        rest_url = "https://rest.istockphoto.com/api/v1/assets/" + gm_id
        wget.download(rest_url, out=path_to_save + gm_id)
        # iStock is sensitive to frequent requests
        time.sleep(0.5)
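Each file saved by the loop above is the raw JSON body returned by the REST endpoint. A minimal sanity check on one of them, touching only the fields the later scripts actually read (downloadsCount, keywords, thumbnails.previewUrl) and using a hypothetical file name, might look like this:

import json

# hypothetical check of one downloaded asset file (gm_id is used as the file name above);
# only the fields consumed by the next scripts are inspected here
sample_path = "/home/forcesh/Downloads/Projects/iStock/urbancow/120549251"
with open(sample_path) as f:
    asset = json.load(f)
print asset['downloadsCount'], len(asset['keywords']), asset['thumbnails']['previewUrl']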
import json
import requests
import wget
import time

author_name = 'urbancow'
portfolio_path = '/home/forcesh/Downloads/Projects/iStock/'
path_to_gm_ids_list = portfolio_path + author_name + '/gm_ids_list.txt'
gm_ids = open(path_to_gm_ids_list, 'r')

rest_info_file = open(portfolio_path + author_name + '/rest_info.csv', 'a')
rest_info_file.write('gm_id,downloads,keywords\n')

for gm_id in gm_ids:
    gm_id = gm_id.replace('\n', '')
    rest_page = open(portfolio_path + author_name + '/' + gm_id, 'r').read()
    parsed_string = json.loads(rest_page)
    downloads_count = parsed_string['downloadsCount']
    info = gm_id + ','
    info += str(downloads_count)
    # keywords contain commas themselves, so quote the field to keep the CSV parseable
    info += ',"' + ','.join(parsed_string['keywords']) + '"\n'
    rest_info_file.write(info)
    thumbnails_preview_url = parsed_string['thumbnails']['previewUrl']
    wget.download(thumbnails_preview_url,
                  out=portfolio_path + author_name + '/images/' + gm_id + '.jpg')
    print thumbnails_preview_url
    time.sleep(0.5)
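Before moving on it is worth confirming that the written CSV parses back into the columns the visualisation script expects (gm_id, downloads, plus the quoted keywords field). A small check, assuming pandas is installed:

import pandas as pd

# read back the CSV written above; the quoted keyword field is parsed as a single column
rest_info = pd.read_csv('/home/forcesh/Downloads/Projects/iStock/urbancow/rest_info.csv')
print rest_info[['gm_id', 'downloads']].head()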
# coding: utf-8
from keras_inception_v3.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing import image
import numpy as np
from keras_inception_v3.imagenet_utils import decode_predictions
import json
import os
import keras
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
import time
import h5py
import keras.backend as K
import tensorflow as tf
import nltk
from nltk.stem import WordNetLemmatizer


def preprocess_image(image_path):
    # load, resize to the InceptionV3 input size and apply the model's preprocessing
    img = image.load_img(image_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x


def preprocess_image_batch(images):
    batch = np.zeros((len(images), 299, 299, 3), dtype=np.float32)
    for i, image_path in enumerate(images):
        batch[i] = preprocess_image(image_path)
    return batch


def get_descriptor(i):
    # descriptors are stored in binary files of num_file_items rows, 2048 doubles each
    offset = i % num_file_items
    file_id = i / num_file_items
    f = open("descriptors/descriptors_inception_v3_pool3_%d.bin" % file_id, "rb")
    f.seek(8 * 2048 * offset, os.SEEK_SET)
    desc = np.fromfile(f, dtype=np.double, count=2048)
    f.close()
    return desc


model = InceptionV3(include_top=False, weights=None)

# load the pre-trained "no top" ImageNet weights layer by layer
weights_file = h5py.File('/home/snapper/.keras/models/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5')
weight_value_tuples = []
for layer in model.layers:
    layer_name = layer.name
    if layer_name in weights_file and len(weights_file[layer_name]) != 0:
        g = weights_file[layer_name]
        weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
        weight_values = [g[weight_name] for weight_name in weight_names]
        symbolic_weights = layer.weights
        weight_value_tuples += zip(symbolic_weights, weight_values)
K.batch_set_value(weight_value_tuples)

batch = preprocess_image_batch(['/home/snapper/120549251.jpg'])
p = model.predict(batch)

images = ['/home/snapper/120549251.jpg' for i in range(100)]
images = np.array(images)

num_file_items = 1024 * 4
descriptors = np.zeros([num_file_items, 2048], dtype=np.double)
step = 64
m = len(images)
for i in range(0, m, step):
    data_end_index = min(i % num_file_items + step,
                         i % num_file_items + min(i + step, m) - i)
    data_indexes = np.arange(i % num_file_items, data_end_index)
    batch_indexes = np.arange(i, min(i + step, m))
    batch = images[batch_indexes]
    X = preprocess_image_batch(batch)
    p = model.predict(X)
    descriptors[data_indexes] = p
    # flush the buffer to disk every num_file_items images and at the very end
    if i % num_file_items + step >= num_file_items or i + step >= m:
        print 'saving to', 'descriptors_inception_v3_pool3_%d.bin' % (i / num_file_items)
        descriptors[:data_end_index].tofile('descriptors/descriptors_inception_v3_pool3_%d.bin' % (i / num_file_items))

np.allclose(get_descriptor(0), p[0])
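The visualisation script below looks descriptors up through a descriptors_map.json file that maps each gm_id to its row index in the binary descriptor files, but the way that map is produced is not shown. A minimal sketch, assuming the descriptors were computed in the same order as the ids in gm_ids_list.txt (paths are hypothetical):

import json

# hypothetical builder for descriptors_map.json: gm_id -> row index in the .bin files,
# assuming descriptor rows follow the order of gm_ids_list.txt
with open('/home/forcesh/Downloads/Projects/iStock/urbancow/gm_ids_list.txt') as f:
    gm_ids = [line.strip() for line in f]
descriptors_map = {gm_id: i for i, gm_id in enumerate(gm_ids)}
with open('descriptors/descriptors_map.json', 'w') as f:
    json.dump(descriptors_map, f)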
# coding: utf-8
import numpy as np
import pandas as pd
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import json
import os
import cv2
from sklearn import manifold, decomposition
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from scipy.stats import gaussian_kde
import gc

num_file_items = 1024 * 4


def get_descriptor(directory, i):
    offset = i % num_file_items
    file_id = i / num_file_items
    f = open(directory + "descriptors_inception_v3_pool3_%d.bin" % file_id, "rb")
    f.seek(8 * 2048 * offset, os.SEEK_SET)
    desc = np.fromfile(f, dtype=np.double, count=2048)
    f.close()
    return desc


def imscatter(x, y, image, ax=None, label=False):
    # place a thumbnail at every (x, y) point of the scatter plot
    label = label == True
    im = OffsetImage(image)
    x, y = np.atleast_1d(x, y)
    artists = []
    for x0, y0 in zip(x, y):
        ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=label)
        artists.append(ax.add_artist(ab))
    ax.update_datalim(np.column_stack([x, y]))
    ax.autoscale()
    return artists


name = 'urbancow'
portfolio_dir = '/home/traineeship/portfolio/'
images_directory = portfolio_dir + name + '/images/'
descriptors_directory = portfolio_dir + name + '/descriptors/'

# order number of gm_id
with open(descriptors_directory + 'descriptors_map.json') as data_file:
    descriptors_map = json.load(data_file)

# gm_id,downloads,keywords
rest_info = pd.read_csv(portfolio_dir + name + '/rest_info.csv')
gm_ids = rest_info['gm_id']
downloads = dict(zip(list(map(str, rest_info['gm_id'])), rest_info['downloads']))

X = np.matrix([get_descriptor(descriptors_directory, descriptors_map[str(gm_id)]) for gm_id in gm_ids])  # descriptors
y = np.array([downloads[str(gm_id)] for gm_id in gm_ids])  # downloads count
sd_downloads = y.std()
mean_downloads = y.mean()

# split large portfolios into parts of ~10000 images so that t-SNE stays tractable
X_list = list()
y_list = list()
gm_ids_list = list()
part_size = 10000
if (len(X) > part_size):
    batches_count = len(X) / part_size + 1
    index = 0
    for i in range(batches_count - 1):
        if (i < batches_count - 2):
            X_list.append(X[index:index + part_size])
            y_list.append(y[index:index + part_size])
            gm_ids_list.append(gm_ids[index:index + part_size])
        elif (i == batches_count - 2):
            if (len(X[index + part_size:]) < 1000):
                # the tail is too small for a separate part, merge it into this one
                X_list.append(X[index:])
                y_list.append(y[index:])
                gm_ids_list.append(gm_ids[index:])
                batches_count -= 1
            else:
                X_list.append(X[index:index + part_size])
                y_list.append(y[index:index + part_size])
                gm_ids_list.append(gm_ids[index:index + part_size])
                index += part_size
                X_list.append(X[index:])
                y_list.append(y[index:])
                gm_ids_list.append(gm_ids[index:])
        index += part_size
else:
    X_list.append(X)
    y_list.append(y)
    gm_ids_list.append(gm_ids)

print len(X_list), len(X)

del X
del y
del gm_ids
del rest_info
del descriptors_map
del downloads
gc.collect()

for ii in range(len(X_list)):
    X = X_list[ii]
    y = y_list[ii]
    gm_ids = gm_ids_list[ii]

    # TruncatedSVD due to sparse data
    X = decomposition.TruncatedSVD(n_components=50).fit_transform(X)
    X = manifold.TSNE().fit_transform(X)

    fig, ax = plt.subplots()
    scale_factor = 15
    fig.set_size_inches(16 * scale_factor, 9 * scale_factor, forward=True)

    # scatter the thumbnails at their t-SNE coordinates
    for i, gm_id in enumerate(gm_ids):
        image_path = images_directory + str(gm_id) + '.jpg'
        try:
            image = cv2.imread(image_path)
            b, g, r = cv2.split(image)    # get b,g,r
            image = cv2.merge([r, g, b])  # switch it to rgb
            image = cv2.resize(image, (80, 80))
        except Exception as ex:
            size = 80, 80, 3
            image = np.zeros(size, dtype=np.uint8)
            pass
        x1 = X[i, 0]
        x2 = X[i, 1]
        imscatter(x1, x2, image, ax)
        ax.plot(x1, x2)

    # four download-count bands: zero, up to mean+sd, between mean+sd and mean+2*sd, above mean+2*sd
    for idx in range(4):
        if (idx == 0):
            x1 = X[y == 0][:, 0]
            x2 = X[y == 0][:, 1]
        elif (idx == 1):
            x1 = X[(y > 0) & (y <= mean_downloads + sd_downloads)][:, 0]
            x2 = X[(y > 0) & (y <= mean_downloads + sd_downloads)][:, 1]
        elif (idx == 2):
            x1 = X[(y > mean_downloads + sd_downloads) & (y <= mean_downloads + 2 * sd_downloads)][:, 0]
            x2 = X[(y > mean_downloads + sd_downloads) & (y <= mean_downloads + 2 * sd_downloads)][:, 1]
        elif (idx == 3):
            x1 = X[y > mean_downloads + 2 * sd_downloads][:, 0]
            x2 = X[y > mean_downloads + 2 * sd_downloads][:, 1]
        xy = np.vstack([x1, x2])
        kde = gaussian_kde(xy)  # simple density estimation
        z = kde(xy)
        xmin, xmax = ax.get_xlim()
        ymin, ymax = ax.get_ylim()
        xedges = np.linspace(xmin, xmax, 700)
        yedges = np.linspace(ymin, ymax, 700)
        xx, yy = np.meshgrid(xedges, yedges)
        gridpoints = np.array([xx.ravel(), yy.ravel()])
        zz = np.reshape(kde(gridpoints), xx.shape)
        im = ax.imshow(zz, cmap='jet', interpolation='nearest', origin='lower',
                       extent=[xmin, xmax, ymin, ymax])
        ax.grid()
        suffix_name = str(idx) + '_tsne_part' + str(ii) + '.png'
        fig.savefig('vism/' + name + '/descriptors/' + name + '_' + suffix_name,
                    dpi=100, bbox_inches='tight')
    fig.clf()
    ax.cla()
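For reference, the four bands above split images purely by download count relative to the mean and standard deviation. A toy illustration of how the masks behave (the numbers are made up):

import numpy as np

# toy illustration of the four download-count bands used above
y = np.array([0, 0, 1, 2, 2, 3, 3, 4, 5, 16, 30])
mean_downloads, sd_downloads = y.mean(), y.std()
bands = [y == 0,
         (y > 0) & (y <= mean_downloads + sd_downloads),
         (y > mean_downloads + sd_downloads) & (y <= mean_downloads + 2 * sd_downloads),
         y > mean_downloads + 2 * sd_downloads]
for idx, mask in enumerate(bands):
    print idx, y[mask]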
Source: https://habr.com/ru/post/309092/