MyDataProvider » Blog » Nordstrom scraper C# code sample

Nordstrom scraper C# code sample

  • by

This post covers our Nordstrom web scraper project. Sample code for the scraper is below:

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CatalogLoader;
using CatalogLoaderCommon;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using RestSharp;
using RestSharp.Extensions;


#if NET462
using Microsoft.CSharp; // for dynamics
#endif

//using NPOI.SS.Util;
//css_reference NPOI.dll
//css_reference NPOI.OOXML.dll
//css_reference NPOI.OpenXml4Net.dll
//css_reference NPOI.OpenXmlFormats.dll
//css_reference ICSharpCode.SharpZipLib.dll

namespace MyScrapers.Nordstorm
{
    /// <summary>
    /// Scraper script for nordstrom.com built on <see cref="CustomScriptBase"/>.
    /// Category pages yield product links and product pages yield the product
    /// fields, gallery images and SKU combinations. Both are read from the
    /// <c>window.__INITIAL_CONFIG__</c> JSON embedded in the page.
    /// </summary>
    public class Nordstorm : CustomScriptBase
    {
        private const string InitialConfigMarker = "window.__INITIAL_CONFIG__ =";
        private const string InitialConfigScriptXPath = "//script[contains(text(), 'window.__INITIAL_CONFIG__ =')]";
        private const string PageLinkXPath = "//a[contains(@href, '?page=')]";
        private const string GallerySectionXPath = "//section[@id='product-page-gallery']";
        private const string ZoomImageXPath = "//div[contains(@id, 'gallery-item-container-zoom')]/img";

        // Pulls the page number out of a pagination href such as "/browse/x?page=3&sort=y".
        private static readonly Regex PageNumberPattern = new Regex(@"[?&]page=(\d+)", RegexOptions.Compiled);

        private readonly string _domain = "https://www.nordstrom.com";
        private TaskInfo _mti;
        private GrabProcessState _gps;

        /// <summary>
        /// Prepares the session: sets the browser user agent, adds the
        /// <c>rfx-forex-rate</c> cookie and keeps references to the task log and
        /// the process state for the other callbacks.
        /// </summary>
        /// <param name="p">Login parameters supplied by the host.</param>
        public override void Login(LoginScriptParameters p)
        {
            base.Login(p);
            p.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36";
            // NOTE(review): presumably pins displayed prices to USD - confirm against the site.
            p.SessionCookieCollection.Add(new Cookie("rfx-forex-rate", "currencyCode=USD&exchangeRate=1", "/", ".nordstrom.com"));
            _mti = p.Process.m_ti;
            // NOTE(review): "as" yields null when State is not a GrabProcessState; every
            // other callback dereferences _gps, so confirm the host always passes one.
            _gps = p.State as GrabProcessState;
        }

        /// <summary>
        /// Returns the base grabber settings with this scraper's overrides applied
        /// (request attempts, user parameters, cache, timeout, ban detection string).
        /// </summary>
        public override GrabberSettings GetGrabberSettings()
        {
            var r = base.GetGrabberSettings();
            r.Settings.RequestAttempts = 20;
            r.Settings.UserParameters = "ApiMode=false[next]ChrapApi=http://93.84.109.210:11000[next]ChrapName=test3";
            r.Settings.CacheEnabled = false;
            r.Settings.RequestTimeout = 100000;
            r.Settings.BanDetectionString = "<title></title>";
            return r;
        }

        /// <summary>No work is needed when the process finishes.</summary>
        public override void ProcessFinished(ProcessFinishedScriptParameters p) { }

        /// <summary>No per-category work is needed besides collecting product links.</summary>
        public override void RunCategory(RunCategoryScriptParameters p) { }

        /// <summary>
        /// Walks every page of a category and appends the product URLs found in the
        /// embedded config JSON to <c>p.Category.ProductLinks</c>. Stops early when
        /// the configured per-category product limit is reached or a page fails to load.
        /// </summary>
        /// <param name="p">Parameters carrying the category whose links are collected.</param>
        public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p)
        {
            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
            var pageCount = 0;
            var pageNumber = 1;

            while (true)
            {
                // Page 1 is the category URL itself; later pages get a "page" parameter.
                var pageUrl = pageNumber == 1
                    ? p.Category.SourceUrl
                    : AppendQueryParameter(p.Category.SourceUrl, "page", pageNumber.ToString(CultureInfo.InvariantCulture));

                if (!hpl.Load(pageUrl))
                {
                    _mti.AddLogInfo("category page not loaded: " + pageUrl);
                    return;
                }

                List<string> pageLinks;
                TextUtils.GetHtmlValue(hpl.HtmlDoc, PageLinkXPath, "href", true, 0, true, out pageLinks);
                _mti.AddLogInfo("Pages:" + (pageLinks == null ? 0 : pageLinks.Count));
                // Keep the highest number seen so far in case the pager only shows a window of pages.
                pageCount = Math.Max(pageCount, GetHighestPageNumber(pageLinks));
                _mti.AddLogInfo("page count: " + pageCount);

                var script = StripConfigAssignment(TextUtils.GetHtmlValue(hpl.HtmlDoc, InitialConfigScriptXPath, "", false));
                if (!string.IsNullOrEmpty(script))
                {
                    foreach (var link in ExtractProductLinks(script))
                    {
                        p.Category.ProductLinks.Add(link);
                        if (IsProductLimitReached(p.Category.ProductLinks.Count))
                        {
                            return;
                        }
                    }
                }

                // Covers "no pagination" (0 or 1) as well as "last page processed".
                if (pageNumber >= pageCount)
                {
                    return;
                }

                pageNumber++;
            }
        }

        /// <summary>
        /// Loads a product page and fills <c>p.Product</c> with its fields, gallery
        /// images and one combination per SKU. Each combination gets the gallery
        /// images of its colour; the colour page is downloaded once per colour id.
        /// </summary>
        /// <param name="p">Parameters carrying the product to populate.</param>
        public override void RunProduct(RunProductScriptParameters p)
        {
            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();

            if (!hpl.Load(p.Product.Url))
            {
                _mti.AddLogInfo("product not loaded..");
                return;
            }

            // The product id is the last path segment of the URL, without the query string.
            var productId = p.Product.Url.Split('/').Last().Split('?').First();
            _mti.AddLogInfo("PRODUCT ID:" + productId);

            var script = StripConfigAssignment(TextUtils.GetHtmlValue(hpl.HtmlDoc, InitialConfigScriptXPath, "", false));
            if (string.IsNullOrEmpty(script))
            {
                _mti.AddLogInfo("script not found: " + InitialConfigMarker + " is missing on " + p.Product.Url);
                return;
            }

            // Reads the gallery image URLs from whatever page hpl currently holds:
            // the gallery section when present, otherwise the zoom containers.
            Func<List<string>> readGalleryImages = () =>
            {
                var galleryHtml = TextUtils.GetHtmlValue(hpl.HtmlDoc, GallerySectionXPath, "", false);
                if (!string.IsNullOrEmpty(galleryHtml))
                {
                    return ParseImageSources(galleryHtml);
                }

                List<string> zoomSources;
                TextUtils.GetHtmlValue(hpl.HtmlDoc, ZoomImageXPath, "src", true, 0, true, out zoomSources);
                return zoomSources ?? new List<string>();
            };

            // Cut the config down to the {"data": ...} object that precedes "styleBoard".
            // NOTE(review): this relies on the last "data": key being the product data - confirm.
            script = "{\"data\": " + script.Split(new[] { "\"data\":" }, StringSplitOptions.None).Last();
            script = script.Split(new[] { ",\"styleBoard\":" }, StringSplitOptions.None).First();
            var jobjectScript = JObject.Parse(script);

            try
            {
                // Must run before any colour page is loaded, while hpl still holds the product page.
                var productImages = readGalleryImages();

                var dataById = jobjectScript["data"] as JObject;
                var productData = dataById == null ? null : dataById[productId];
                if (productData == null || productData.Type == JTokenType.Null)
                {
                    _mti.AddLogInfo("product data not found for id: " + productId);
                    return;
                }

                p.Product.Art = ReadString(productData, "$.id");
                p.Product.Name = ReadString(productData, "$.productName");
                p.Product.Manufacturer = ReadString(productData, "$.brand.brandName");
                p.Product.FullDescription = ReadString(productData, "$.description");
                p.Product.Quantity = ReadString(productData, "$.maxOrderQuantity");
                p.Product.Price = ReadString(productData, "$.price.style.priceString");

                foreach (var imageSrc in productImages)
                {
                    _mti.AddLogInfo("Image link: " + imageSrc);
                    p.Product.ImageAdd(imageSrc);
                }

                var skusById = productData.SelectToken("$.skus.byId") as JObject;
                if (skusById == null)
                {
                    return;
                }

                var combinations = new List<Combination>();
                // Successful downloads only, so a failed colour is retried for the next SKU.
                var imagesByColorId = new Dictionary<string, List<string>>();

                foreach (var skuProperty in skusById.Properties())
                {
                    var sku = skuProperty.Value;
                    var combination = new Combination();
                    combination.Art = ReadString(sku, "$.id");
                    combination.Price = ReadString(sku, "$.price");
                    combination.Quantity = ReadString(sku, "$.totalQuantityAvailable");
                    AddAttributeIfPresent(combination, "Color", ReadString(sku, "$.colorDisplayValue"));
                    AddAttributeIfPresent(combination, "Size", ReadString(sku, "$.sizeDisplayValue"));

                    var colorId = ReadString(sku, "$.colorId");
                    List<string> colorImages;
                    if (colorId.Length == 0)
                    {
                        // No colour to request: fall back to the images of the product page itself.
                        colorImages = productImages;
                    }
                    else if (!imagesByColorId.TryGetValue(colorId, out colorImages))
                    {
                        if (hpl.Load(AppendQueryParameter(p.Product.Url, "color", colorId)))
                        {
                            colorImages = readGalleryImages();
                            imagesByColorId[colorId] = colorImages;
                        }
                        else
                        {
                            _mti.AddLogError("Not loaded");
                            colorImages = new List<string>();
                        }
                    }

                    foreach (var imageSrc in colorImages)
                    {
                        combination.ImageAdd(imageSrc);
                    }

                    combinations.Add(combination);
                }

                p.Product.CombinationsAdd(combinations);
            }
            catch (Exception ex)
            {
                // Keep the run going, but log the full exception instead of only its message.
                _mti.AddLogError("RunProduct failed for " + p.Product.Url + ": " + ex);
            }
        }

        /// <summary>
        /// Removes the <c>window.__INITIAL_CONFIG__ =</c> assignment, surrounding
        /// whitespace and a trailing semicolon from the script text.
        /// </summary>
        /// <returns>The remaining text, or an empty string when <paramref name="scriptText"/> is null.</returns>
        private static string StripConfigAssignment(string scriptText)
        {
            if (string.IsNullOrEmpty(scriptText))
            {
                return string.Empty;
            }

            return scriptText.Replace(InitialConfigMarker, "").Trim().TrimEnd(';').TrimEnd();
        }

        /// <summary>Returns the largest <c>page=N</c> value among the hrefs, or 0 when none is found.</summary>
        private static int GetHighestPageNumber(IEnumerable<string> hrefs)
        {
            var highest = 0;
            if (hrefs == null)
            {
                return highest;
            }

            foreach (var href in hrefs)
            {
                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                var match = PageNumberPattern.Match(href);
                int number;
                if (match.Success
                    && int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out number)
                    && number > highest)
                {
                    highest = number;
                }
            }

            return highest;
        }

        /// <summary>Appends <c>name=value</c> to the URL using "?" or "&amp;" depending on whether a query string exists.</summary>
        private static string AppendQueryParameter(string url, string name, string value)
        {
            var separator = url.IndexOf('?') >= 0 ? "&" : "?";
            return url + separator + name + "=" + Uri.EscapeDataString(value);
        }

        /// <summary>Returns the string form of the token at <paramref name="path"/>, or an empty string when it is missing or JSON null.</summary>
        private static string ReadString(JToken source, string path)
        {
            var token = source.SelectToken(path);
            return token == null || token.Type == JTokenType.Null ? string.Empty : token.ToString();
        }

        /// <summary>Adds the dynamic attribute unless the value is empty or the literal text "null".</summary>
        private static void AddAttributeIfPresent(Combination combination, string name, string value)
        {
            if (!string.IsNullOrEmpty(value) && value != "null")
            {
                combination.AddDynamicAttribute(name, value);
            }
        }

        /// <summary>True when a positive per-category product limit is configured and has been reached.</summary>
        private bool IsProductLimitReached(int collectedCount)
        {
            var settings = _gps.GrabberSettings.Settings;
            return !settings.IsMaxProductsInCategoryNull()
                && settings.MaxProductsInCategory > 0
                && collectedCount >= settings.MaxProductsInCategory;
        }

        /// <summary>
        /// Reads the absolute product URLs from <c>productResults.productsById</c>
        /// of a category page's config JSON.
        /// </summary>
        private List<string> ExtractProductLinks(string configJson)
        {
            var links = new List<string>();
            var productsById = JObject.Parse(configJson).SelectToken("$.productResults.productsById");
            if (productsById == null)
            {
                _mti.AddLogInfo("productResults.productsById not found in page config");
                return links;
            }

            foreach (var entry in productsById)
            {
                foreach (var product in entry)
                {
                    var link = ReadString(product, "$.productPageUrl");
                    if (link.Length > 0)
                    {
                        links.Add(_domain + link);
                    }
                }
            }

            return links;
        }

        /// <summary>Returns the non-empty <c>src</c> values of every <c>img</c> in the HTML fragment.</summary>
        private List<string> ParseImageSources(string html)
        {
            var sources = new List<string>();
            var fragment = new HtmlDocument();
            fragment.LoadHtml(html);

            // SelectNodes returns null, not an empty collection, when nothing matches.
            var imageNodes = fragment.DocumentNode.SelectNodes(".//img");
            if (imageNodes == null)
            {
                _mti.AddLogInfo("Node is null");
                return sources;
            }

            foreach (var imageNode in imageNodes)
            {
                var src = imageNode.GetAttributeValue("src", string.Empty);
                if (!string.IsNullOrEmpty(src))
                {
                    sources.Add(src);
                }
            }

            return sources;
        }
    }
}

Leave a Reply

Your email address will not be published.