CREATE TABLE panda.competitors ( id int(11) NOT NULL AUTO_INCREMENT, name varchar(50) DEFAULT NULL, PRIMARY KEY (id) ) ENGINE = INNODB CHARACTER SET utf8 COLLATE utf8_general_ci;
CREATE TABLE panda.competitors_selector ( id int(11) NOT NULL AUTO_INCREMENT, competitor_id int(11) DEFAULT NULL, selector varchar(255) DEFAULT NULL, use_status CHAR(20) DEFAULT 'unused', PRIMARY KEY (id) ) ENGINE = INNODB CHARACTER SET utf8 COLLATE utf8_general_ci;
CREATE TABLE panda.competitors_data ( id INT(11) NOT NULL AUTO_INCREMENT, competitor_id INT(11) DEFAULT NULL, sku INT(11) DEFAULT NULL, competitor_url VARCHAR(255) DEFAULT NULL, competitor_price INT(11) DEFAULT NULL, competitor_response_ms INT(11) DEFAULT NULL, competitor_price_status CHAR(20) DEFAULT 'created', last_update_unixtime INT(11) DEFAULT NULL, PRIMARY KEY (id) ) ENGINE = INNODB CHARACTER SET utf8 COLLATE utf8_general_ci;
UPDATE competitors_selector SET use_status = 'unuse' WHERE 1;
SELECT * FROM competitors_selector
SELECT * FROM competitors_data
UPDATE competitors_data SET competitor_price = '{{price}}' ,competitor_response_ms = {{response_ms}} ,last_update_unixtime = {{response_time}} ,competitor_price_status = '{{status}}' WHERE id = {{id}}
{:['selector1','selector2']}
var needle = require('needle'); var mysql = require('mysql'); var templayed = require('templayed'); var fs = require('fs'); var schema = require("semantic-schema-parser"); var Scraping = new function(){ //1. _this = this; _this.DB = mysql.createConnection({ host : 'localhost', user : '******************', password : '******************', database : 'panda' }); _this.querys = {}; _this.selectors = {};//dom Scraping'a _this.rows = null;// url _this.workers = 5;// _this.needleOptions = { follow_max : 5 // 5 ,headers:{'User-Agent': 'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) '} ,open_timeout: 2000 // if we don't get our response headers in 5 seconds, boom. } this.run = function(){ _this.DB.connect(); _this.queryLoad(); _this.resetSelectorsStatus(); } //--------------------// // { ()} this.queryLoad = function(){ _this.querys['selectcSelectors'] = fs.readFileSync('sql/selectcSelectors.sql', 'utf8'); _this.querys['resetSelectorsStatus'] = fs.readFileSync('sql/resetSelectorsStatus.sql', 'utf8'); _this.querys['selectcURLs'] = fs.readFileSync('sql/selectcURLs.sql', 'utf8'); _this.querys['updateCompetitorData'] = fs.readFileSync('sql/updateCompetitorData.sql', 'utf8'); }; //--------------------// // this.resetSelectorsStatus = function(callback){ _this.DB.query( _this.querys['resetSelectorsStatus'], function(err, rows) { _this.selectcSelectors(); }); } //--------------------// //---- DOM this.selectcSelectors = function(callback){ _this.DB.query( _this.querys['selectcSelectors'] , function(err, rows) { if (!err){ rows.forEach(function(item, i, arr) { if (!_this.selectors[item.competitor_id]){_this.selectors[item.competitor_id]=[];} _this.selectors[item.competitor_id].push({id:item.id,selector:item.selector}); }); _this.selectcURLs(); } }); } //--------------------// this.selectcURLs = function(callback){ _this.DB.query( _this.querys['selectcURLs'] , function(err, rows) { if (!err){ _this.rows = rows; _this.workersTask(); } }); } //--------------------// this.workersTask = function(){ __this = this; __this.worker = []; for (var i = 0; i < _this.workers; i++) { __this.worker[i] = { id:i ,status: 'run' ,allTask : function(){ var status = {}; status['run'] = 0; status['iddle'] = 0; __this.worker.forEach(function(item){ if(item.status==='run'){status['run']++;} else{status['iddle']++;} }); return status; } ,timeStart : new Date().valueOf() ,func: function(){ _this.parseData(__this.worker[this.id]) } } __this.worker[i].func(); } } //--------------------// this.parseData = function(worker){ __this = this; var count = _this.rows.length; var startTime = new Date().valueOf(); if(count > 0 ){ var row = _this.rows.shift();// rows var URL = row.competitor_url; needle.get(URL, _this.needleOptions, function(err, res){ if (err) {worker.func(); return;} var timeRequest = ( new Date().valueOf() ) - startTime; schema.parseContent(res.body, function(schemaObject, $){ price = _this.parseBody(schemaObject,$,row); status = ( price!='NULL' ) ? 'active' : 'error'; //console.log(res.statusCode, 'timeout:'+timeRequest, 'price:'+price, worker.id, URL, row.id); _this.updateCompetitorData({ id : row.id ,status:status ,response_time:( new Date().valueOf() ) ,response_ms:timeRequest ,price:price }); worker.func(); }); }); }else{ worker.status = 'idle'; if(worker.allTask().run===0){ console.log(' , '); _this.DB.end();// ; } } } _this.parseBody = function(schemaObject,$,row){ var price; schemaObject.elems.forEach(function(elemsItem, i) { if(!elemsItem.product)return; elemsItem.product.forEach(function(productItem, i) { if(!productItem.price)return; price = ( productItem.price['content'] ) ? productItem.price['content'] : ( productItem.price['text'] ) ? productItem.price['text'] : null; price = price.replace(/[^.\d]+/g,"").replace( /^([^\.]*\.)|\./g, '$1' ); price = Math.floor(price); }); }); if( (!price) && (_this.selectors[row.competitor_id]) ){ _this.selectors[row.competitor_id].forEach(function(selector){ price_text = $(selector.selector).text(); if (price_text){ price = price_text.replace(/[^.\d]+/g,"").replace( /^([^\.]*\.)|\./g, '$1' ); price = Math.floor(price); // selector //-------------------------------------------------// return; } }); } if(!price){price = 'NULL'}; return price; } //--------------------// _this.updateCompetitorData = function(data){ query = templayed( _this.querys['updateCompetitorData'] )(data); _this.DB.query( query , function(err, rows) {}); } //--------------------// _this.run(); }
<div class="price price_break"><ins class="num">2980</ins><ins class="rub"> .</ins></div>
Source: https://habr.com/ru/post/328734/
All Articles