📜 ⬆️ ⬇️

An attempt to implement a universal online-store parser using SlimerJS

I want to present an example of a template parser for online stores. The example does not claim to be a universal tool for extracting structured data from any online store, but it may be suitable for the many standard online stores found on the Internet.
parsing online store
As a tool for parsing the site, I use SlimerJS.

I have tried to keep the example as simple and as generic as possible.

So, the entry point:
The first part of this file is the general logic related to running SlimerJS.
script.js
// Entry script: load the local grab module, build a parser and start it.
var grab = require('./grab');

// The catalog path is relative; grab.js combines it with the host from its config.
var parser = grab.create();
parser.init('/catalog');

Here the parser module is loaded and initialized, and the URL of the product catalog page is passed to the init() method; the link is relative. The main domain of the site is specified in the config.js file.

All of the parser logic is in grab.js. I divided it into two parts; the first part is a wrapper object around SlimerJS that lets several browser instances run simultaneously.
I put all of my comments directly in the listings to make the code easier to understand.
')
grab.js
 // Module-level helpers shared by every Grab instance in this file.
var file = require("./file").create();        // file writer helper
var config = require("./config").getConfig(); // site settings (host, result file, ...)

/**
 * Wrapper around one SlimerJS "webpage" object. Each instance creates its own
 * page in init(), wires the page callbacks and hands control to route() once
 * the page has loaded.
 */
function Grab() {
    this.page = null;           // the "webpage" object, created in init()
    this.current_url = null;    // last URL reported by onUrlChanged
    this.parentCategory = null; // value passed as `parent` to init()

    /**
     * Creates the page, registers callbacks and opens the target address.
     * @param {string} url path relative to config.host (for example /contacts);
     *                     when falsy, config.host itself is opened
     * @param {*} parent value stored as-is in this.parentCategory
     */
    this.init = function(url, parent) {
        this.page = require("webpage").create();
        this.callbackInit();

        // Build the address locally instead of appending to config.host:
        // config is shared by all Grab instances of this module, so mutating
        // it would make every later instance open an ever-growing URL.
        var target = url ? config.host + url : config.host;

        this.parentCategory = parent;
        this.open(target);
    };

    /**
     * Opens the given address in the page.
     * @param {string} url address to load
     */
    this.open = function(url) {
        this.page.open(url);
    };

    /**
     * Closes the page owned by this instance.
     */
    this.close = function() {
        this.page.close();
    };

    /**
     * Registers the page callbacks used by the parser.
     */
    this.callbackInit = function() {
        var self = this;

        /**
         * Logs the error message reported by the page.
         * @param {string} message error text
         * @param {*} stack unused
         */
        this.page.onError = function (message, stack) {
            console.log(message);
        };

        /**
         * Remembers the URL the page navigated to.
         * @param {string} url new URL
         */
        this.page.onUrlChanged = function (url) {
            self.current_url = url;
        };

        /**
         * Forwards console output of the page to this script's console.
         * @param {string} message
         * @param {*} line unused
         * @param {*} file unused
         */
        this.page.onConsoleMessage = function (message, line, file) {
            console.log(message);
        };

        /**
         * Runs when the page finished loading; starts parsing on success.
         * @param {string} status load status, 'success' when the page loaded
         */
        this.page.onLoadFinished = function(status) {
            if (status !== 'success') {
                console.log("Sorry, the page is not loaded");
                self.close();
                // Do not route a page that failed to load and is already closed.
                return;
            }
            self.route();
        };
    };
}

The second part of the file defines the behavior and extends the Grab object created above.
 Grab.prototype.route = function() { try { //         if(this.isCategoryPage()) { var categories = this.getCategories(); //      file.writeJson(config.result_file, categories, 'a'); //     for (var i = 0; i < categories.length; i++) { //     var url = categories[i].url_article; //  URL       new Grab().init(url, categories[i].title); //    slimer.wait(3000); //  3 ,     } } else { //      var content = this.getContent(); //      file.writeJson(config.result_file, content, 'a'); //     this.close(); //    } this.close(); } catch(err) { console.log(err); this.close(); } }; /** *     ,    * @returns {Object} */ Grab.prototype.getCategories = function() { return this.getContent('categories') }; /** * ,       * @returns {bool} */ Grab.prototype.isCategoryPage = function() { return this.page.evaluate(function() { // ,   ,     return !$(".catalog-list .item .price").length; }); }; /** *      * @param {string} typeContent     {categories|product} * @returns {Object} */ Grab.prototype.getContent = function(typeContent) { var result = this.page.evaluate(function(typeContent) { var result = []; //  ,      (      ) $(".catalog-list .item").each(function(key, value) { var $link = $(value).find('a.name'); //   var obj = { //      'type': 'category', 'title': $link.text().trim().toLowerCase(), //   'url_article': $link.attr('href'), //        'url_article_image': $(value).find('a.img > img').attr('src') }; //    ,        if(typeContent !== 'categories') { obj.size = []; obj.type = 'product'; $('.razmers:first .pink').each(function(key, value) { // ||... obj.size.push($(value).text().trim()); }); obj.price = parseInt($(value).find('.price').text(), 10); //  } result.push(obj); }); return result; }, typeContent); return result; }; exports.create = function() { return new Grab(); }; 

To make working with the file system convenient, SlimerJS provides an API for both reading and writing data.
file.js
 var fs = require('fs'); /** *    */ function FileHelper() { /** *   * @param {string} path_to_file     * @returns array -  */ this.read = function(path_to_file) { if(!fs.isFile(path_to_file)){ throw new Error('File ('+path_to_file+') not found'); } var content = fs.read(path_to_file); if(!content.length) { throw new Error('File ('+path_to_file+') empty'); } return content.split("\n"); }; /** *     * @param {string} path_to_file     * @param {string} content    * @param {string} mode   'r', 'w', 'a/+', 'b' */ this.write = function(path_to_file, content, mode) { fs.write(path_to_file, content, mode); } /** *     JSON * @param {string} path_to_file     * @param {array} content    * @param {string} mode   'r', 'w', 'a/+', 'b' */ this.writeJson = function(path_to_file, content, mode) { var result = ''; for(var i=0; i < content.length; i++) { result += JSON.stringify(content[i]) + "\n"; } this.write(path_to_file, result, mode); } } exports.create = function() { return new FileHelper(); }; 

The last file is a configuration file in which you can specify settings common to the whole system.
config.js
 var Config = function() { this.host = 'http://example.ru'; this.log_path = 'logs\\error.txt'; this.result_file = 'result\\result.txt'; }; exports.getConfig = function() { return new Config(); }; 


The result is written to a file that can then be processed for further data export.
The script is launched from the console with the following command:

 # Run script.js with the slimerjs executable.
 slimerjs script.js 

image Sources

Source: https://habr.com/ru/post/267873/


All Articles