Visualization of the photo portfolio. Part 1: Unusual deep learning

The problem in brief: given a photo portfolio containing a large number of photos, we want to find out which topics are popular and which are not.

As an example, take one author's portfolio on iStock (only photos that are at most 2 years old are considered; about 5000 photos).

This is for those who want to pull information from iStock

The links to the photos look like this: www.istockphoto.com/ru/ru/en/photo/monitor-signal-give-gm516188033-48762778?st=5721cc0 , where the digits after "gm" and before the next hyphen are the photo ID (516188033).
Information about the photo (number of downloads, download date, keywords, author's name, ...) can be found at: rest.istockphoto.com/api/v1/assets/516188033

Sample script for downloading the JSON pages from the REST API

# Download the REST-API JSON description of every asset in an iStock
# author's portfolio.
#
# For each portfolio search page the script extracts the asset id (the
# digits between "gm" and the next "-" in the asset link) from every
# <figure> and saves https://rest.istockphoto.com/api/v1/assets/<id> to
# path_to_save/<id>.
import requests
from bs4 import BeautifulSoup
import wget
import json
import sys
import time

author_name = "urbancow"
author_id = 300612
page_count = 93
path_to_save = "/home/forcesh/Downloads/Projects/iStock/urbancow/"

# The portfolio URL does not depend on the page number, so build it once.
search_url = 'http://www.istockphoto.com/ru/portfolio/' + author_name

# NOTE(review): range(1, page_count) requests pages 1 .. page_count - 1;
# confirm that page_count is meant to be "last page + 1".
for page in range(1, page_count):
    data = {
        "facets": {
            "pageNumber": page,
            "perPage": 100,
            "abstractType": ["photos", "illustrations"],
            "order": "bestMatch",
            "portfolioID": [author_id],
            "additionalAudio": "false",
        }
    }
    r = requests.post(search_url, json.dumps(data))
    # Fail with a clear HTTP error instead of an opaque IndexError further
    # down when the site answers with an error / throttling page.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "lxml")
    scope = soup.select("div.results-container")
    search_list = scope[0].select("section.search-list")
    figure_list = search_list[0].select("figure")

    for figure in figure_list:
        href = figure.select("a")[0].get("href")
        # ".../photo/<slug>-gm516188033-48762778" -> "516188033"
        gm_id = href.rsplit("gm", 1)[1].split('-')[0]
        rest_url = "https://rest.istockphoto.com/api/v1/assets/" + gm_id
        wget.download(rest_url, out=path_to_save + gm_id)
        # iStock is sensitive to frequent requests
        time.sleep(0.5)

Sample script for downloading images and forming a csv file

 # Build rest_info.csv (gm_id, downloads, keywords) from the saved REST JSON
# pages and download the preview thumbnail of every asset.
import csv
import json
import os
import requests
import wget
import time

author_name = 'urbancow'
portfolio_path = '/home/forcesh/Downloads/Projects/iStock/'
author_dir = portfolio_path + author_name
path_to_gm_ids_list = author_dir + '/gm_ids_list.txt'
path_to_rest_info = author_dir + '/rest_info.csv'

# The CSV is opened in append mode so that an interrupted run can be
# resumed; write the header only when the file is new or empty, otherwise
# every run would inject another header line into the middle of the data.
write_header = (not os.path.exists(path_to_rest_info)
                or os.path.getsize(path_to_rest_info) == 0)

with open(path_to_gm_ids_list, 'r') as gm_ids, \
        open(path_to_rest_info, 'a') as rest_info_file:
    # csv.writer quotes the keywords field (it contains commas), so every
    # row has exactly three columns. lineterminator='\n' keeps plain
    # newlines with a text-mode file on both Python 2 and Python 3.
    writer = csv.writer(rest_info_file, lineterminator='\n')
    if write_header:
        writer.writerow(['gm_id', 'downloads', 'keywords'])

    for gm_id in gm_ids:
        gm_id = gm_id.strip()
        if not gm_id:
            # Tolerate blank lines in the id list.
            continue

        # The JSON page was saved under the bare asset id.
        with open(author_dir + '/' + gm_id, 'r') as rest_page:
            parsed_string = json.load(rest_page)

        writer.writerow([
            gm_id,
            parsed_string['downloadsCount'],
            ','.join(parsed_string['keywords']),
        ])

        thumbnails_preview_url = parsed_string['thumbnails']['previewUrl']
        wget.download(thumbnails_preview_url,
                      out=author_dir + '/images/' + gm_id + '.jpg')
        print(thumbnails_preview_url)
        # iStock is sensitive to frequent requests
        time.sleep(0.5)

The features by which the data will be split are obtained with the InceptionV3 model. However, we will not use the names of the objects that the neural network finds, but the output signals of the penultimate layer (2048 features).

An example of a Keras script that extracts the output of the penultimate layer

 # coding: utf-8
# Compute InceptionV3 descriptors (2048 doubles per image) for a list of
# images and store them in fixed-size binary files
# descriptors/descriptors_inception_v3_pool3_<file_id>.bin
from keras_inception_v3.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing import image
import numpy as np
from keras_inception_v3.imagenet_utils import decode_predictions
import json
import os
import keras
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
import time
import h5py
import keras.backend as K
import tensorflow as tf
import nltk
from nltk.stem import WordNetLemmatizer


def preprocess_image(image_path):
    """Load one image, resize it to 299x299 and return a 1-image batch."""
    img = image.load_img(image_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x


def preprocess_image_batch(images):
    """Return a float32 array (len(images), 299, 299, 3) of preprocessed images."""
    batch = np.zeros((len(images), 299, 299, 3), dtype=np.float32)
    for i, image_path in enumerate(images):
        batch[i] = preprocess_image(image_path)
    return batch


def get_descriptor(i):
    """Read the descriptor with global index *i* back from the .bin files.

    Each file holds up to ``num_file_items`` records of 2048 doubles
    (8 bytes each), so the record is located by file id and byte offset.
    """
    offset = i % num_file_items
    # Floor division: the file id must stay an integer on Python 3 as well.
    file_id = i // num_file_items
    with open("descriptors/descriptors_inception_v3_pool3_%d.bin" % file_id, "rb") as f:
        f.seek(8 * 2048 * offset, os.SEEK_SET)
        desc = np.fromfile(f, dtype=np.double, count=2048)
    return desc


model = InceptionV3(include_top=False, weights=None)

# Load the pretrained weights layer by layer. The file is opened read-only
# and must stay open until batch_set_value has consumed the datasets.
weights_path = '/home/snapper/.keras/models/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
with h5py.File(weights_path, 'r') as weights_file:
    weight_value_tuples = []
    for layer in model.layers:
        layer_name = layer.name
        if layer_name in weights_file and len(weights_file[layer_name]) != 0:
            g = weights_file[layer_name]
            weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
            weight_values = [g[weight_name] for weight_name in weight_names]
            symbolic_weights = layer.weights
            weight_value_tuples += zip(symbolic_weights, weight_values)
    K.batch_set_value(weight_value_tuples)

# Single-image prediction before the batched run.
batch = preprocess_image_batch(['/home/snapper/120549251.jpg'])
p = model.predict(batch)

# Demo input: the same image repeated 100 times.
images = ['/home/snapper/120549251.jpg' for i in range(100)]
images = np.array(images)

num_file_items = 1024*4
descriptors = np.zeros([num_file_items, 2048], dtype=np.double)
step = 64
m = len(images)

for i in range(0, m, step):
    # Number of images in this batch (the last batch may be shorter).
    batch_len = min(i + step, m) - i
    # Position of the batch inside the current descriptors file.
    data_start_index = i % num_file_items
    data_end_index = data_start_index + batch_len

    data_indexes = np.arange(data_start_index, data_end_index)
    batch_indexes = np.arange(i, min(i + step, m))
    batch = images[batch_indexes]
    X = preprocess_image_batch(batch)
    # assumes model.predict returns one 2048-vector per image -- TODO confirm
    p = model.predict(X)
    descriptors[data_indexes] = p

    # Flush when the current file is full or the input is exhausted.
    if data_start_index + step >= num_file_items or i + step >= m:
        file_name = 'descriptors_inception_v3_pool3_%d.bin' % (i // num_file_items)
        print('saving to %s' % file_name)
        descriptors[:data_end_index].tofile('descriptors/' + file_name)

# Round-trip sanity check: the first stored descriptor must match the
# network output. (p is the last batch; the comparison with descriptor 0 is
# meaningful here only because every input image is the same file.)
print('descriptor round-trip ok: %s' % np.allclose(get_descriptor(0), p[0]))

Link to trained InceptionV3

Next, we will use t-distributed Stochastic Neighbor Embedding (t-SNE), a tool for visualizing high-dimensional data. The sklearn documentation recommends first reducing the dimensionality with another method (PCA for dense data or TruncatedSVD for sparse data), for example to 50, and only then applying t-SNE.

In our case, the data is sparse, so we choose TruncatedSVD and reduce the dimensionality to 50. Then t-SNE reduces the data to a two-dimensional space. That is all: now the pictures can be drawn.

In order to highlight popular and unpopular topics on the resulting image, we will use gaussian_kde from scipy (a function for density estimation). Calculate the mean and the standard deviation (sd) of the download counts and select 4 groups of photos:

photos with 0 downloads
photos with a download count > 0 and <= mean + sd
photos with a download count > mean + sd and <= mean + 2 * sd
photos with a download count > mean + 2 * sd

Next, we will build 4 pictures, one for each group of photos. The gradient from blue to green and from green to red visualizes the density of points (photos) in a given area. The closer the color is to red, the more points in that area belong to the group in question.

Portfolio Visualization Script

 # coding: utf-8
# Visualize a photo portfolio: embed the InceptionV3 descriptors into 2-D
# (TruncatedSVD -> t-SNE), draw every thumbnail at its embedded position
# and overlay a density map for each of four download-count groups.
import numpy as np
import pandas as pd
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import json
import os
import cv2
from sklearn import manifold, decomposition
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from scipy.stats import gaussian_kde
import gc

num_file_items = 1024*4


def get_descriptor(directory, i):
    """Read the descriptor with global index *i* from the .bin files in *directory*.

    Each file holds up to ``num_file_items`` records of 2048 doubles
    (8 bytes each).
    """
    offset = i % num_file_items
    # Floor division keeps the file id an integer on Python 3 as well.
    file_id = i // num_file_items
    with open(directory + "descriptors_inception_v3_pool3_%d.bin" % file_id, "rb") as f:
        f.seek(8 * 2048 * offset, os.SEEK_SET)
        desc = np.fromfile(f, dtype=np.double, count=2048)
    return desc


def imscatter(x, y, image, ax=None, label=False):
    """Draw *image* at every (x, y) data position on *ax*.

    ax: target axes; defaults to the current axes.
    label: when True, a frame is drawn around each image.
    Returns the list of added artists.
    """
    if ax is None:
        # The default used to crash on ax.add_artist; fall back to the
        # current axes instead.
        ax = plt.gca()
    label = label == True
    im = OffsetImage(image)
    x, y = np.atleast_1d(x, y)
    artists = []
    for x0, y0 in zip(x, y):
        ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=label)
        artists.append(ax.add_artist(ab))
    # Annotation boxes do not extend the data limits on their own.
    ax.update_datalim(np.column_stack([x, y]))
    ax.autoscale()
    return artists


def _chunk_bounds(total, part_size, min_tail):
    """Return (start, stop) slices covering range(total) in part_size chunks.

    A final remainder shorter than *min_tail* is merged into the previous
    chunk instead of forming a chunk of its own.
    """
    if total <= part_size:
        return [(0, total)]
    full_parts, tail = divmod(total, part_size)
    bounds = [(k * part_size, (k + 1) * part_size) for k in range(full_parts)]
    if tail < min_tail:
        bounds[-1] = (bounds[-1][0], total)
    else:
        bounds.append((full_parts * part_size, total))
    return bounds


def _load_thumbnail(image_path, size=80):
    """Return the image as a size x size RGB array, or a black placeholder."""
    placeholder = np.zeros((size, size, 3), dtype=np.uint8)
    # cv2.imread returns None (it does not raise) for a missing/unreadable file.
    image = cv2.imread(image_path)
    if image is None:
        return placeholder
    try:
        b, g, r = cv2.split(image)      # OpenCV loads images as BGR
        image = cv2.merge([r, g, b])    # switch it to RGB for matplotlib
        return cv2.resize(image, (size, size))
    except cv2.error:
        return placeholder


name = 'urbancow'
portfolio_dir = '/home/traineeship/portfolio/'
images_directory = portfolio_dir + name + '/images/'
descriptors_directory = portfolio_dir + name + '/descriptors/'
output_directory = 'vism/' + name + '/descriptors/'

# gm_id -> ordinal number of its descriptor in the .bin files
with open(descriptors_directory + 'descriptors_map.json') as data_file:
    descriptors_map = json.load(data_file)

# columns: gm_id,downloads,keywords
rest_info = pd.read_csv(portfolio_dir + name + '/rest_info.csv')
gm_ids = rest_info['gm_id']
downloads = dict(zip(list(map(str, rest_info['gm_id'])), rest_info['downloads']))

# descriptors (plain ndarray; np.matrix is deprecated)
X = np.array([get_descriptor(descriptors_directory, descriptors_map[str(gm_id)])
              for gm_id in gm_ids])
# downloads count
y = np.array([downloads[str(gm_id)] for gm_id in gm_ids])

# Group thresholds are computed over the whole portfolio, not per part.
sd_downloads = y.std()
mean_downloads = y.mean()
low_threshold = mean_downloads + sd_downloads
high_threshold = mean_downloads + 2 * sd_downloads

# Split large portfolios into parts of at most part_size photos; a tail of
# fewer than 1000 photos is merged into the previous part.
X_list = list()
y_list = list()
gm_ids_list = list()
part_size = 10000
for start, stop in _chunk_bounds(len(X), part_size, 1000):
    X_list.append(X[start:stop])
    y_list.append(y[start:stop])
    gm_ids_list.append(gm_ids[start:stop])

print('%d %d' % (len(X_list), len(X)))

# Free the full-size structures before the memory-hungry embedding step.
del X
del y
del gm_ids
del rest_info
del descriptors_map
del downloads
gc.collect()

if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

for ii in range(len(X_list)):
    X = X_list[ii]
    y = y_list[ii]
    gm_ids = gm_ids_list[ii]

    # TruncatedSVD due to sparse data
    X = decomposition.TruncatedSVD(n_components=50).fit_transform(X)
    X = manifold.TSNE().fit_transform(X)

    fig, ax = plt.subplots()
    scale_factor = 15
    fig.set_size_inches(16*scale_factor, 9*scale_factor, forward=True)

    # Place every thumbnail at its embedded position.
    for i, gm_id in enumerate(gm_ids):
        image = _load_thumbnail(images_directory + str(gm_id) + '.jpg')
        x1 = X[i, 0]
        x2 = X[i, 1]
        imscatter(x1, x2, image, ax)
        ax.plot(x1, x2)

    # Four download-count groups: none, ordinary, popular, very popular.
    group_masks = [
        y == 0,
        (y > 0) & (y <= low_threshold),
        (y > low_threshold) & (y <= high_threshold),
        y > high_threshold,
    ]

    for idx, mask in enumerate(group_masks):
        group = X[mask]
        x1 = group[:, 0]
        x2 = group[:, 1]
        if x1.size < 3:
            # gaussian_kde needs more samples than dimensions.
            print('skipping group %d of part %d: only %d photo(s)' % (idx, ii, x1.size))
            continue

        xy = np.vstack([x1, x2])
        kde = gaussian_kde(xy)  # simple density estimation

        # Evaluate the density on a 700x700 grid spanning the current view.
        xmin, xmax = ax.get_xlim()
        ymin, ymax = ax.get_ylim()
        xedges = np.linspace(xmin, xmax, 700)
        yedges = np.linspace(ymin, ymax, 700)
        xx, yy = np.meshgrid(xedges, yedges)
        gridpoints = np.array([xx.ravel(), yy.ravel()])
        zz = np.reshape(kde(gridpoints), xx.shape)

        im = ax.imshow(zz, cmap='jet', interpolation='nearest',
                       origin='lower', extent=[xmin, xmax, ymin, ymax])
        # Explicit True: a bare ax.grid() toggles, which would switch the
        # grid off again for every second group.
        ax.grid(True)

        suffix_name = str(idx) + '_tsne_part' + str(ii) + '.png'
        fig.savefig(output_directory + name + '_' + suffix_name,
                    dpi=100, bbox_inches='tight')

    # Release the (very large) figure before the next part.
    fig.clf()
    plt.close(fig)

The following pictures were built (the size of the pictures is about 50 MB, the resolution is about 10k x 10k; GPicView opens them quite well):

It turned out that the author has the following popular topics:

photos of streets from above
places with a large gathering of people (subway, ...);
attractions
students
inside supermarkets with children

and not popular:

landscape; nature

P.S. For those who do not want to collect the data and compute the features themselves but want to see how the data is visualized, I am sharing the portfolio of another author together with the precomputed features.

Source: https://habr.com/ru/post/309092/

All Articles

Visualization of the photo portfolio. Part 1: Unusual deep learning

More articles: