Nordstrom web scraper C# project. Below is sample C# code for this scraper.
It is a variant of the source code that we use for our Nordstrom web scraper.
We hope you find it interesting to see how we scrape the Nordstrom website and export real-time data.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CatalogLoader;
using CatalogLoaderCommon;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using RestSharp;
using RestSharp.Extensions;
#if NET462
using Microsoft.CSharp; // for dynamics
#endif
//using NPOI.SS.Util;
//css_reference NPOI.dll
//css_reference NPOI.OOXML.dll
//css_reference NPOI.OpenXml4Net.dll
//css_reference NPOI.OpenXmlFormats.dll
//css_reference ICSharpCode.SharpZipLib.dll

namespace MyScrapers.Nordstorm
{
    /// <summary>
    /// Scraper script for https://www.nordstrom.com. Category pages and product pages are both
    /// read from the JSON object that the page assigns to <c>window.__INITIAL_CONFIG__</c>;
    /// images are read from the product gallery markup.
    /// </summary>
    public class Nordstorm : CustomScriptBase
    {
        /// <summary>Assignment prefix that precedes the embedded JSON inside the page script.</summary>
        private const string InitialConfigMarker = "window.__INITIAL_CONFIG__ =";

        /// <summary>XPath of the script element that carries the embedded JSON.</summary>
        private const string InitialConfigXPath = "//script[contains(text(), 'window.__INITIAL_CONFIG__ =')]";

        /// <summary>XPath of the gallery section on a product page.</summary>
        private const string GalleryXPath = "//section[@id='product-page-gallery']";

        /// <summary>XPath of the zoom images used when the gallery section is absent.</summary>
        private const string ZoomImageXPath = "//div[contains(@id, 'gallery-item-container-zoom')]/img";

        /// <summary>Extracts the numeric value of a <c>page</c> query parameter from a link.</summary>
        private static readonly Regex PageNumberPattern = new Regex("[?&]page=([0-9]+)", RegexOptions.Compiled);

        // The four fields below are not referenced anywhere in this class; they are kept as-is
        // in case the hosting framework or a derived script relies on them (NOTE(review): confirm).
        private bool _imageNameSku = true;
        private string _hashCod;
        private Product _product;
        private Category _category;

        private string _domain = "https://www.nordstrom.com";
        private TaskInfo _mti;
        private GrabProcessState _gps;

        /// <summary>
        /// Sets the user agent and a USD currency cookie for the session, and captures the task
        /// logger and process state used by the other callbacks.
        /// </summary>
        /// <param name="p">Login parameters supplied by the host.</param>
        public override void Login(LoginScriptParameters p)
        {
            base.Login(p);
            p.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36";
            p.SessionCookieCollection.Add(new Cookie("rfx-forex-rate", "currencyCode=USD&exchangeRate=1", "/", ".nordstrom.com"));
            _mti = p.Process.m_ti;
            // NOTE(review): 'as' yields null when State is not a GrabProcessState; the other
            // callbacks dereference _gps without a check.
            _gps = p.State as GrabProcessState;
        }

        /// <summary>
        /// Returns the base grabber settings with this script's overrides applied.
        /// </summary>
        /// <returns>The adjusted settings object.</returns>
        public override GrabberSettings GetGrabberSettings()
        {
            var r = base.GetGrabberSettings();
            r.Settings.RequestAttempts = 20;
            r.Settings.UserParameters = "ApiMode=false[next]ChrapApi=http://93.84.109.210:11000[next]ChrapName=test3";
            r.Settings.CacheEnabled = false;
            r.Settings.RequestTimeout = 100000;
            r.Settings.BanDetectionString = "<title></title>";
            return r;
        }

        //public override void GrabCatalogBuild(GrabCatalogBuildScriptParameters p)
        //{
        //    Init(p);
        //}

        /// <summary>Intentionally empty: nothing is done when the process finishes.</summary>
        public override void ProcessFinished(ProcessFinishedScriptParameters p)
        {
        }

        /// <summary>Intentionally empty: no per-category work is done here.</summary>
        public override void RunCategory(RunCategoryScriptParameters p)
        {
        }

        /// <summary>
        /// Collects product links for a category by walking its result pages in order.
        /// The highest page number found in the pager links of the first page is used as the
        /// page count. Collection stops early when the configured per-category product limit
        /// is reached or when a page fails to load.
        /// </summary>
        /// <param name="p">Parameters whose <c>Category.ProductLinks</c> receives the links.</param>
        public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p)
        {
            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
            if (!hpl.Load(p.Category.SourceUrl))
            {
                return;
            }

            List<string> pages = new List<string>();
            TextUtils.GetHtmlValue(hpl.HtmlDoc, "//a[contains(@href, '?page=')]", "href", true, 0, true, out pages);

            int pageCount = 0;
            if (pages != null)
            {
                _mti.AddLogInfo("Pages:" + pages.Count);
                foreach (var href in pages)
                {
                    // Tolerates hrefs that carry a path or extra parameters around "page=N".
                    int pageNum = ParsePageNumber(href);
                    if (pageCount < pageNum)
                    {
                        pageCount = pageNum;
                    }
                }
            }

            _mti.AddLogInfo("page count: " + pageCount);

            int currentPage = 1;
            while (true)
            {
                // Always parse the document that is currently loaded, so each page is read once.
                var script = StripInitialConfigPrefix(TextUtils.GetHtmlValue(hpl.HtmlDoc, InitialConfigXPath, "", false));
                if (!string.IsNullOrEmpty(script))
                {
                    var config = JObject.Parse(script);
                    var productsById = config.SelectToken("$.productResults.productsById");
                    if (productsById != null)
                    {
                        foreach (var productEntry in productsById)
                        {
                            foreach (var productInfo in productEntry)
                            {
                                var link = ReadTokenText(productInfo, "$.productPageUrl");
                                if (string.IsNullOrEmpty(link))
                                {
                                    continue;
                                }

                                p.Category.ProductLinks.Add(_domain + link);

                                if (!_gps.GrabberSettings.Settings.IsMaxProductsInCategoryNull()
                                    && _gps.GrabberSettings.Settings.MaxProductsInCategory > 0
                                    && p.Category.ProductLinks.Count >= _gps.GrabberSettings.Settings.MaxProductsInCategory)
                                {
                                    return;
                                }
                            }
                        }
                    }
                }

                currentPage++;
                if (currentPage > pageCount)
                {
                    // Also covers pageCount of 0 or 1: only the first page exists.
                    break;
                }

                if (!hpl.Load(AppendQueryParameter(p.Category.SourceUrl, "page", currentPage.ToString(CultureInfo.InvariantCulture))))
                {
                    break;
                }
            }
        }

        /// <summary>
        /// Loads a product page and fills the product's fields, images and per-SKU combinations
        /// from the embedded JSON. Images for a combination are taken from the product page
        /// loaded with that SKU's colour id; each colour page is downloaded at most once.
        /// </summary>
        /// <param name="p">Parameters whose <c>Product</c> is read (Url) and populated.</param>
        public override void RunProduct(RunProductScriptParameters p)
        {
            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
            if (!hpl.Load(p.Product.Url))
            {
                _mti.AddLogInfo("product not loaded..");
                return;
            }

            // Last path segment of the URL, without any query string.
            var productId = p.Product.Url.Split('/').Last().Split('?').First();
            _mti.AddLogInfo("PRODUCT ID:" + productId);

            var script = StripInitialConfigPrefix(TextUtils.GetHtmlValue(hpl.HtmlDoc, InitialConfigXPath, "", false));
            if (string.IsNullOrEmpty(script))
            {
                _mti.AddLogInfo("script: " + script);
            }

            // Gallery markup must be captured now: hpl is reused for the colour pages below.
            var galleryHtml = TextUtils.GetHtmlValue(hpl.HtmlDoc, GalleryXPath, "", false);
            List<string> zoomImages = new List<string>();
            if (string.IsNullOrEmpty(galleryHtml))
            {
                TextUtils.GetHtmlValue(hpl.HtmlDoc, ZoomImageXPath, "src", true, 0, true, out zoomImages);
            }

            if (string.IsNullOrEmpty(script))
            {
                return;
            }

            // Cut the text down to the "data" object: keep what follows the last "data": key
            // and drop everything from ,"styleBoard": onwards.
            script = "{\"data\": " + script.Split(new string[] { "\"data\":" }, StringSplitOptions.None).Last();
            script = script.Split(new string[] { ",\"styleBoard\":" }, StringSplitOptions.None).First();
            var config = JObject.Parse(script);

            try
            {
                var productData = config.SelectToken("$.data." + productId);
                if (productData == null)
                {
                    _mti.AddLogInfo("product data not found for id: " + productId);
                    return;
                }

                // Missing or null fields become empty strings instead of aborting the product.
                p.Product.Art = ReadTokenText(productData, "$.id");
                p.Product.Name = ReadTokenText(productData, "$.productName");
                p.Product.Manufacturer = ReadTokenText(productData, "$.brand.brandName");
                p.Product.FullDescription = ReadTokenText(productData, "$.description");
                p.Product.Quantity = ReadTokenText(productData, "$.maxOrderQuantity");
                p.Product.Price = ReadTokenText(productData, "$.price.style.priceString");

                if (!string.IsNullOrEmpty(galleryHtml))
                {
                    var gallerySources = ReadImageSources(galleryHtml);
                    if (gallerySources == null)
                    {
                        _mti.AddLogInfo("Node is null");
                    }
                    else
                    {
                        foreach (var imageSrc in gallerySources)
                        {
                            _mti.AddLogInfo("Image link: " + imageSrc);
                            p.Product.ImageAdd(imageSrc);
                        }
                    }
                }
                else if (zoomImages != null)
                {
                    foreach (var img in zoomImages)
                    {
                        p.Product.ImageAdd(img);
                    }
                }

                var skus = productData.SelectToken("$.skus.byId");
                if (skus != null)
                {
                    List<Combination> combinations = new List<Combination>();

                    // Successfully loaded colour pages, keyed by colour id, so SKUs that differ
                    // only by size do not trigger a new download. Failed loads are not cached.
                    var imagesByColor = new Dictionary<string, List<string>>();

                    foreach (JProperty sku in skus)
                    {
                        var skuData = sku.Value;
                        var combination = new Combination();
                        combination.Art = ReadTokenText(skuData, "$.id");
                        combination.Price = ReadTokenText(skuData, "$.price");
                        combination.Quantity = ReadTokenText(skuData, "$.totalQuantityAvailable");

                        var color = ReadTokenText(skuData, "$.colorDisplayValue");
                        if (HasDisplayValue(color))
                        {
                            combination.AddDynamicAttribute("Color", color);
                        }

                        var size = ReadTokenText(skuData, "$.sizeDisplayValue");
                        if (HasDisplayValue(size))
                        {
                            combination.AddDynamicAttribute("Size", size);
                        }

                        var colorId = ReadTokenText(skuData, "$.colorId");
                        List<string> colorImages;
                        if (!imagesByColor.TryGetValue(colorId, out colorImages))
                        {
                            colorImages = null;
                            if (hpl.Load(AppendQueryParameter(p.Product.Url, "color", colorId)))
                            {
                                var colorGalleryHtml = TextUtils.GetHtmlValue(hpl.HtmlDoc, GalleryXPath, "", false);
                                if (!string.IsNullOrEmpty(colorGalleryHtml))
                                {
                                    colorImages = ReadImageSources(colorGalleryHtml);
                                }
                                else
                                {
                                    List<string> colorZoomImages = new List<string>();
                                    TextUtils.GetHtmlValue(hpl.HtmlDoc, ZoomImageXPath, "src", true, 0, true, out colorZoomImages);
                                    colorImages = colorZoomImages;
                                }

                                if (colorImages == null)
                                {
                                    colorImages = new List<string>();
                                }

                                imagesByColor[colorId] = colorImages;
                            }
                            else
                            {
                                _mti.AddLogError("Not loaded");
                            }
                        }

                        if (colorImages != null)
                        {
                            foreach (var img in colorImages)
                            {
                                combination.ImageAdd(img);
                            }
                        }

                        combinations.Add(combination);
                    }

                    p.Product.CombinationsAdd(combinations);
                }
            }
            catch (Exception ex)
            {
                _mti.AddLogInfo(ex.Message);
            }
        }

        /// <summary>
        /// Removes the <c>window.__INITIAL_CONFIG__ =</c> prefix from the script text.
        /// </summary>
        /// <param name="scriptText">Raw script text; may be null.</param>
        /// <returns>The text without the prefix, or an empty string for null input.</returns>
        private static string StripInitialConfigPrefix(string scriptText)
        {
            if (scriptText == null)
            {
                return string.Empty;
            }

            return scriptText.Replace(InitialConfigMarker, "");
        }

        /// <summary>
        /// Reads the value at <paramref name="path"/> as text.
        /// </summary>
        /// <param name="source">Token to query; may be null.</param>
        /// <param name="path">JSONPath expression passed to <c>SelectToken</c>.</param>
        /// <returns>The token's text, or an empty string when it is missing or JSON null.</returns>
        private static string ReadTokenText(JToken source, string path)
        {
            if (source == null)
            {
                return string.Empty;
            }

            var token = source.SelectToken(path);
            if (token == null || token.Type == JTokenType.Null)
            {
                return string.Empty;
            }

            return token.ToString();
        }

        /// <summary>
        /// Tells whether a colour/size display value should be recorded as an attribute:
        /// it must be non-empty and not the literal text "null".
        /// </summary>
        private static bool HasDisplayValue(string value)
        {
            return !string.IsNullOrEmpty(value) && value != "null";
        }

        /// <summary>
        /// Extracts the page number from a pager link.
        /// </summary>
        /// <param name="href">Link text containing a <c>page=N</c> query parameter.</param>
        /// <returns>The page number, or 0 when none can be read.</returns>
        private static int ParsePageNumber(string href)
        {
            if (string.IsNullOrEmpty(href))
            {
                return 0;
            }

            var match = PageNumberPattern.Match(href);
            int number;
            if (match.Success
                && int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out number))
            {
                return number;
            }

            return 0;
        }

        /// <summary>
        /// Appends <c>name=value</c> to a URL, using <c>?</c> when the URL has no query string
        /// yet and <c>&amp;</c> otherwise. The value is URL-escaped.
        /// </summary>
        private static string AppendQueryParameter(string url, string name, string value)
        {
            var separator = url.IndexOf('?') >= 0 ? "&" : "?";
            return url + separator + name + "=" + Uri.EscapeDataString(value ?? string.Empty);
        }

        /// <summary>
        /// Parses an HTML fragment and returns the <c>src</c> attribute of every <c>img</c>
        /// element in it (an empty string for an image without <c>src</c>).
        /// </summary>
        /// <param name="galleryHtml">HTML fragment to parse.</param>
        /// <returns>The image sources in document order, or null when there is no <c>img</c> element.</returns>
        private static List<string> ReadImageSources(string galleryHtml)
        {
            var fragment = new HtmlDocument();
            fragment.LoadHtml(galleryHtml);

            var imageNodes = fragment.DocumentNode.SelectNodes(".//img");
            if (imageNodes == null)
            {
                return null;
            }

            var sources = new List<string>();
            foreach (var imageNode in imageNodes)
            {
                sources.Add(imageNode.GetAttributeValue("src", string.Empty));
            }

            return sources;
        }
    }
}