Nordstrom web scraper C# project: below is sample C# code for this scraper.
It is a variant of the source code that we use for our Nordstrom web scraper.
We hope it will be interesting to see how we scrape the Nordstrom website and export real-time data.
[code lang="csharp"]
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CatalogLoader;
using CatalogLoaderCommon;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using RestSharp;
using RestSharp.Extensions;
#if NET462
using Microsoft.CSharp; // for dynamics
#endif
//using NPOI.SS.Util;
//css_reference NPOI.dll
//css_reference NPOI.OOXML.dll
//css_reference NPOI.OpenXml4Net.dll
//css_reference NPOI.OpenXmlFormats.dll
//css_reference ICSharpCode.SharpZipLib.dll
namespace MyScrapers.Nordstorm
{
public class Nordstorm : CustomScriptBase
{
// Not referenced anywhere in the visible code of this class.
// NOTE(review): may be read by the base class or leftover from a template - confirm before removing.
private bool _imageNameSku = true;
// Not referenced anywhere in the visible code of this class.
private string _hashCod;
// Site root; prepended to the relative product URLs found in category listings.
private string _domain = "https://www.nordstrom.com";
// Not referenced anywhere in the visible code of this class.
private Product _product;
// Not referenced anywhere in the visible code of this class.
private Category _category;
// Task info captured in Login (p.Process.m_ti); used for all logging in this class.
private TaskInfo _mti;
// Grab process state captured in Login (p.State); provides the page loader proxy and grabber settings.
private GrabProcessState _gps;
/// <summary>
/// Runs the base login, then configures the session (user agent and a currency cookie)
/// and captures the task info and process state used by the other methods of this class.
/// </summary>
/// <param name="p">Login parameters supplied by the framework.</param>
public override void Login(LoginScriptParameters p)
{
    const string browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36";

    base.Login(p);

    p.UserAgent = browserUserAgent;

    // Cookie value requests USD with an exchange rate of 1 - presumably so prices come back in USD.
    var forexCookie = new Cookie("rfx-forex-rate", "currencyCode=USD&exchangeRate=1", "/", ".nordstrom.com");
    p.SessionCookieCollection.Add(forexCookie);

    // Keep handles for logging and for page loading in later calls.
    _mti = p.Process.m_ti;
    _gps = p.State as GrabProcessState;
}
/// <summary>
/// Returns the base grabber settings with this scraper's overrides applied:
/// retry count, user parameters, cache switch, request timeout and ban detection marker.
/// </summary>
/// <returns>The settings object obtained from the base class, modified in place.</returns>
public override GrabberSettings GetGrabberSettings()
{
    const int maxRequestAttempts = 20;
    // NOTE(review): unit is defined by the framework (looks like milliseconds) - confirm.
    const int requestTimeout = 100000;
    const string userParameters = "ApiMode=false[next]ChrapApi=http://93.84.109.210:11000[next]ChrapName=test3";
    // A page with an empty <title> is treated as the ban marker.
    const string banMarker = "<title></title>";

    var grabberSettings = base.GetGrabberSettings();

    grabberSettings.Settings.RequestAttempts = maxRequestAttempts;
    grabberSettings.Settings.UserParameters = userParameters;
    grabberSettings.Settings.CacheEnabled = false;
    grabberSettings.Settings.RequestTimeout = requestTimeout;
    grabberSettings.Settings.BanDetectionString = banMarker;

    return grabberSettings;
}
//public override void GrabCatalogBuild(GrabCatalogBuildScriptParameters p)
//{
// Init(p);
//}
/// <summary>No-op override: this scraper performs no work when the process finishes.</summary>
public override void ProcessFinished(ProcessFinishedScriptParameters p) { }
/// <summary>No-op override: this scraper performs no per-category work here.</summary>
public override void RunCategory(RunCategoryScriptParameters p) { }
/// <summary>
/// Collects product page links for a category by walking its paginated listing pages.
/// Page 1 is the category source URL; the number of pages is taken from the highest
/// "page=N" value found in the pagination links of page 1. Product URLs are read from
/// the JSON assigned to window.__INITIAL_CONFIG__ (path productResults.productsById)
/// and added to p.Category.ProductLinks prefixed with the site domain.
/// Stops early when the configured MaxProductsInCategory limit is reached or a page
/// fails to load.
/// </summary>
/// <param name="p">Framework parameters; p.Category.SourceUrl is read, p.Category.ProductLinks is appended to.</param>
public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p)
{
    var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
    int pageCount = 1;

    for (int page = 1; page <= pageCount; page++)
    {
        // Load the page we are about to parse. (Previously the first page was reloaded on
        // every iteration, so later pages were requested but never actually parsed.)
        var pageUrl = page == 1
            ? p.Category.SourceUrl
            : BuildCategoryPageUrl(p.Category.SourceUrl, page);
        if (!hpl.Load(pageUrl))
        {
            break;
        }

        if (page == 1)
        {
            List<string> pages;
            TextUtils.GetHtmlValue(hpl.HtmlDoc, "//a[contains(@href, 'page=')]", "href", true, 0, true, out pages);
            if (pages == null)
            {
                pages = new List<string>();
            }
            _mti.AddLogInfo("Pages:" + pages.Count);

            int detectedPageCount = GetHighestPageNumber(pages);
            _mti.AddLogInfo("page count: " + detectedPageCount);

            // No pagination links means a single page.
            pageCount = Math.Max(1, detectedPageCount);
        }

        var script = (TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(), 'window.__INITIAL_CONFIG__ =')]", "", false) ?? string.Empty)
            .Replace("window.__INITIAL_CONFIG__ =", "");
        if (string.IsNullOrEmpty(script))
        {
            continue;
        }

        var productsById = JObject.Parse(script).SelectToken("$.productResults.productsById");
        if (productsById == null)
        {
            _mti.AddLogInfo("productsById not found on page " + page);
            continue;
        }

        foreach (var entry in productsById.Children<JProperty>())
        {
            var link = entry.Value.SelectToken("$.productPageUrl");
            if (link == null || link.Type == JTokenType.Null)
            {
                continue;
            }

            p.Category.ProductLinks.Add(_domain + link);

            if (HasReachedProductLimit(p.Category.ProductLinks.Count))
            {
                return;
            }
        }
    }
}

// True when a positive MaxProductsInCategory is configured and linkCount has reached it.
private bool HasReachedProductLimit(int linkCount)
{
    return !_gps.GrabberSettings.Settings.IsMaxProductsInCategoryNull()
        && _gps.GrabberSettings.Settings.MaxProductsInCategory > 0
        && linkCount >= _gps.GrabberSettings.Settings.MaxProductsInCategory;
}

// Appends the page number to a category URL, using '?' or '&' depending on whether a query string already exists.
private static string BuildCategoryPageUrl(string categoryUrl, int page)
{
    var separator = categoryUrl.Contains("?") ? "&" : "?";
    return categoryUrl + separator + "page=" + page.ToString(CultureInfo.InvariantCulture);
}

// Returns the largest N found in a "page=N" query parameter among the given hrefs, or 0 when none is found.
private static int GetHighestPageNumber(IEnumerable<string> hrefs)
{
    int highest = 0;
    foreach (var href in hrefs)
    {
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // ';' covers HTML-encoded separators such as "&amp;page=2".
        var match = Regex.Match(href, @"(?:^|[?&;])page=(\d+)", RegexOptions.IgnoreCase);
        int pageNumber;
        if (match.Success
            && int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out pageNumber)
            && pageNumber > highest)
        {
            highest = pageNumber;
        }
    }
    return highest;
}
/// <summary>
/// Loads a product page and fills p.Product from the JSON assigned to
/// window.__INITIAL_CONFIG__: article id, name, brand, description, quantity, price,
/// gallery images, and one Combination per SKU (id, price, quantity, Color/Size
/// attributes and the images of the SKU's color variant page).
/// Missing or JSON-null fields are stored as empty strings instead of aborting the
/// whole product. Each color variant page is requested once and its images are
/// reused for every SKU that shares the same colorId.
/// </summary>
/// <param name="p">Framework parameters; p.Product.Url is read, p.Product is populated.</param>
public override void RunProduct(RunProductScriptParameters p)
{
    var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
    if (!hpl.Load(p.Product.Url))
    {
        _mti.AddLogInfo("product not loaded..");
        return;
    }

    var productId = ExtractProductId(p.Product.Url);
    _mti.AddLogInfo("PRODUCT ID:" + productId);

    var script = (TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(), 'window.__INITIAL_CONFIG__ =')]", "", false) ?? string.Empty)
        .Replace("window.__INITIAL_CONFIG__ =", "");
    if (string.IsNullOrEmpty(script))
    {
        _mti.AddLogInfo("script: " + script);
        return;
    }

    // Reads image URLs from whatever page hpl currently holds: the gallery section when
    // present, otherwise the zoom-container <img> elements.
    Func<List<string>> readGalleryImages = () =>
    {
        var found = new List<string>();
        var galleryHtml = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//section[@id='product-page-gallery']", "", false);
        if (!string.IsNullOrEmpty(galleryHtml))
        {
            var galleryDoc = new HtmlDocument();
            galleryDoc.LoadHtml(galleryHtml);
            var imageNodes = galleryDoc.DocumentNode.SelectNodes(".//img");
            if (imageNodes == null)
            {
                _mti.AddLogInfo("Node is null");
                return found;
            }
            foreach (var imageNode in imageNodes)
            {
                var src = imageNode.GetAttributeValue("src", string.Empty);
                if (!string.IsNullOrEmpty(src))
                {
                    found.Add(src);
                }
            }
            return found;
        }

        List<string> zoomImages;
        TextUtils.GetHtmlValue(hpl.HtmlDoc, "//div[contains(@id, 'gallery-item-container-zoom')]/img", "src", true, 0, true, out zoomImages);
        if (zoomImages != null)
        {
            found.AddRange(zoomImages);
        }
        return found;
    };

    // Must be read now: hpl is reused for the color variant pages below.
    var productImages = readGalleryImages();

    try
    {
        // Cut the config down to the last "data" object and drop everything from "styleBoard" on.
        script = "{\"data\": " + script.Split(new string[] { "\"data\":" }, StringSplitOptions.None).Last();
        script = script.Split(new string[] { ",\"styleBoard\":" }, StringSplitOptions.None).First();

        var productData = JObject.Parse(script).SelectToken("$.data." + productId);
        if (productData == null)
        {
            _mti.AddLogInfo("Product data not found for id " + productId);
            return;
        }

        p.Product.Art = ReadTokenText(productData, "$.id");
        p.Product.Name = ReadTokenText(productData, "$.productName");
        p.Product.Manufacturer = ReadTokenText(productData, "$.brand.brandName");
        p.Product.FullDescription = ReadTokenText(productData, "$.description");
        p.Product.Quantity = ReadTokenText(productData, "$.maxOrderQuantity");
        p.Product.Price = ReadTokenText(productData, "$.price.style.priceString");

        foreach (var imageSrc in productImages)
        {
            _mti.AddLogInfo("Image link: " + imageSrc);
            p.Product.ImageAdd(imageSrc);
        }

        var skus = productData.SelectToken("$.skus.byId");
        if (skus == null)
        {
            return;
        }

        var combinations = new List<Combination>();
        // colorId -> images of that color's variant page, so each page is fetched once.
        var imagesByColor = new Dictionary<string, List<string>>();

        foreach (var sku in skus.Children<JProperty>())
        {
            var skuData = sku.Value;
            var combination = new Combination();
            combination.Art = ReadTokenText(skuData, "$.id");
            combination.Price = ReadTokenText(skuData, "$.price");
            combination.Quantity = ReadTokenText(skuData, "$.totalQuantityAvailable");

            var color = ReadTokenText(skuData, "$.colorDisplayValue");
            if (HasAttributeValue(color))
            {
                combination.AddDynamicAttribute("Color", color);
            }

            var size = ReadTokenText(skuData, "$.sizeDisplayValue");
            if (HasAttributeValue(size))
            {
                combination.AddDynamicAttribute("Size", size);
            }

            var colorId = ReadTokenText(skuData, "$.colorId");
            if (!string.IsNullOrEmpty(colorId))
            {
                List<string> colorImages;
                if (!imagesByColor.TryGetValue(colorId, out colorImages))
                {
                    if (hpl.Load(BuildColorUrl(p.Product.Url, colorId)))
                    {
                        colorImages = readGalleryImages();
                        imagesByColor[colorId] = colorImages;
                    }
                    else
                    {
                        // Not cached, so a later SKU of the same color retries the load.
                        _mti.AddLogError("Not loaded");
                        colorImages = new List<string>();
                    }
                }

                foreach (var colorImage in colorImages)
                {
                    combination.ImageAdd(colorImage);
                }
            }

            combinations.Add(combination);
        }

        p.Product.CombinationsAdd(combinations);
    }
    catch (Exception ex)
    {
        _mti.AddLogInfo(ex.Message);
    }
}

// Returns the last path segment of the URL, ignoring any query string, fragment or trailing slash.
private static string ExtractProductId(string productUrl)
{
    if (string.IsNullOrEmpty(productUrl))
    {
        return string.Empty;
    }
    var path = productUrl.Split('?', '#')[0].TrimEnd('/');
    return path.Substring(path.LastIndexOf('/') + 1);
}

// Returns the text of the token at path, or an empty string when the token is missing or JSON null.
private static string ReadTokenText(JToken node, string path)
{
    var token = node == null ? null : node.SelectToken(path);
    if (token == null || token.Type == JTokenType.Null)
    {
        return string.Empty;
    }
    return token.ToString();
}

// True when the text is a usable attribute value (not empty and not the literal string "null").
private static bool HasAttributeValue(string text)
{
    return !string.IsNullOrEmpty(text) && text != "null";
}

// Appends the color parameter to the product URL, using '?' or '&' depending on whether a query string already exists.
private static string BuildColorUrl(string productUrl, string colorId)
{
    var separator = productUrl.Contains("?") ? "&" : "?";
    return productUrl + separator + "color=" + Uri.EscapeDataString(colorId);
}
}
}
[/code]