MyDataProvider » Blog » walgreens c# webscraper

Walgreens C# web scraper

  • by

[code lang="csharp"]

using System;
using System.Collections.Generic;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CatalogLoader;
using CatalogLoaderCommon;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

//css_reference HtmlAgilityPack;
//css_reference NPOI.dll
//css_reference NPOI.OOXML.dll
//css_reference NPOI.OpenXml4Net.dll
//css_reference NPOI.OpenXmlFormats.dll
//css_reference ICSharpCode.SharpZipLib.dll

namespace CatalogLoaderVSScriptEditor
{
public class CustomScript : CustomScriptBase
{
public HtmlPageLoader HplProduct { get; set; }
public HtmlPageLoader HplCatalog { get; set; }
public HtmlPageLoader HplLinks { get; set; }

private bool _imageNameSku;
private string _hashCod;
private string _domen;
private Product _product;
private Category _category;
private TaskInfo _mti;
string _domain = "https://www.walgreens.com";

private GrabProcessState _gps;

/// <summary>Creates a script instance with no category or product attached.</summary>
public CustomScript()
{
}

/// <summary>Creates a script instance bound to a category.</summary>
/// <param name="mroot">Category stored as the script's current category.</param>
public CustomScript(Category mroot)
{
    this._category = mroot;
}

/// <summary>Creates a script instance bound to a product.</summary>
/// <param name="product">Product stored as the script's current product.</param>
public CustomScript(Product product)
{
    this._product = product;
}

List<string> _upcList = new List<string>();

/// <summary>
/// Stores the grab-process state and task info taken from the login parameters, then loads
/// the UPC list from "InputFile.txt" in the application data directory.
/// </summary>
/// <param name="p">Login parameters supplying the process state and the running process.</param>
public override void Login(LoginScriptParameters p)
{
    _gps = p.State as GrabProcessState;
    _mti = p.Process.m_ti;

    // The input file location is fixed. A disabled alternative read it from the grabber
    // settings via UserParameterGet("InputFile").
    var upcFile = Path.Combine(UtilSmall.ApplicationDataDirectory, "InputFile.txt");
    FromFileRead(upcFile);

    // No HTTP login is performed; a WebClientWithCookies-based POST login was stubbed out here.
}

/// <summary>
/// Loads the UPC codes to look up from a text file, one code per line.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <remarks>
/// Lines are trimmed and blank lines are dropped, so a trailing newline or stray whitespace
/// in the input file does not turn into a search request for an empty code.
/// </remarks>
public void FromFileRead(string filePath)
{
    _upcList = File.ReadAllLines(filePath)
        .Select(line => line.Trim())
        .Where(line => line.Length > 0)
        .ToList();
}
/// <summary>
/// Builds the category tree from the navigation JSON embedded in the site's home page
/// (the script block containing <c>__HEADER_INITIAL_STATE__</c>) and assigns it to
/// <c>p.Root</c>.
/// </summary>
/// <param name="p">Catalog-build parameters; receives the resulting root category.</param>
/// <remarks>
/// When the script block or its JSON payload cannot be found, an error is logged and an
/// empty root category is returned instead of failing inside the JSON parser.
/// </remarks>
public override void GrabCatalogBuild(GrabCatalogBuildScriptParameters p)
{
    _mti = p.Process.m_ti;
    _category = new Category();

    var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
    hpl.Load(_domain);

    // XPath string literals need plain ASCII quotes.
    var script = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(),'__HEADER_INITIAL_STATE__')]", "", false);

    // Cut the JSON object out of the "... = {...}};" assignment and drop the trailing ';'.
    var json = new Regex(@"{[\w\W]+}};").Match(script ?? string.Empty).Value.TrimEnd(';');

    if (string.IsNullOrEmpty(json))
    {
        _mti.AddLogError("Category menu data (__HEADER_INITIAL_STATE__) was not found on " + _domain);
    }
    else
    {
        var categories = JObject.Parse(json).SelectToken("$.header.headNavData.menu-shop-products.categories");
        if (categories != null)
        {
            // The top level has the same shape as every nested level, so the recursive
            // helper handles the whole tree.
            SubCategoryAdd(categories, _category);
        }
    }

    p.Root = _category;
}

/// <summary>
/// Recursively converts a JSON array of category nodes into <see cref="Category"/> objects
/// attached to <paramref name="parent"/>.
/// </summary>
/// <param name="script">JSON token enumerating category nodes; each node is read for
/// <c>name</c>, <c>url</c> and an optional nested <c>categories</c> array.</param>
/// <param name="parent">Category that receives the converted nodes as children.</param>
/// <remarks>Nodes without a <c>name</c> or <c>url</c> are skipped together with their children.</remarks>
public void SubCategoryAdd(JToken script, Category parent)
{
    if (script == null)
    {
        return;
    }

    foreach (var cat in script)
    {
        var nameToken = cat.SelectToken("$.name");
        var urlToken = cat.SelectToken("$.url");
        if (nameToken == null || urlToken == null)
        {
            continue;
        }

        var category = new Category();
        category.Name = nameToken.ToString();
        category.SourceUrl = _domain + urlToken.ToString();
        parent.AddCategory(category);

        var children = cat.SelectToken("$.categories");
        if (children != null)
        {
            SubCategoryAdd(children, category);
        }
    }
}

// Empty override: this script does nothing when the process finishes.
public override void ProcessFinished(ProcessFinishedScriptParameters p) { }
// Empty override: this script does no per-category work.
public override void RunCategory(RunCategoryScriptParameters p) { }

/// <summary>
/// Resolves every UPC loaded by <see cref="FromFileRead"/> to a product link by running the
/// site search for that code and taking the link of the first product card in the results.
/// </summary>
/// <param name="p">Parameters whose <c>Category.ProductLinks</c> collection receives the links.</param>
/// <remarks>
/// A UPC whose search yields no product card is logged and skipped, so the bare domain is
/// never added as a product link.
/// </remarks>
public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p)
{
    var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();

    foreach (var upc in _upcList)
    {
        _mti.AddLogInfo("Upc:" + upc);

        // Example: https://www.walgreens.com/search/results.jsp?Ntt=300411848
        hpl.Load("https://www.walgreens.com/search/results.jsp?Ntt=" + Uri.EscapeDataString(upc));

        // XPath string literals need plain ASCII quotes.
        var productLink = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//ul[@class='product-container']/li[@class='item card card__product']//a", "href", true);
        if (string.IsNullOrEmpty(productLink))
        {
            _mti.AddLogInfo("no product found for upc:" + upc);
            continue;
        }

        p.Category.ProductLinks.Add(_domain + productLink);
        _mti.AddLogInfo("product link count:" + p.Category.ProductLinks.Count());
    }
}

/// <summary>
/// Post-processes a grabbed product: sets its bar code from the product URL and rebuilds its
/// category chain from the breadcrumb HTML stored in the <c>CATEGORY_PATH</c> attribute.
/// </summary>
/// <param name="p">Parameters carrying the product to update.</param>
/// <remarks>
/// The earlier, fully commented-out extraction logic (price, images, sizes, combinations,
/// description) has been removed from this method; only the active behaviour remains.
/// </remarks>
public override void RunProduct(RunProductScriptParameters p)
{
    // Takes whatever follows the last "Ntt=" in the URL.
    // NOTE(review): when the URL contains no "Ntt=" the whole URL is stored as the bar code.
    // Links produced by GetProductLinksForCategory look like product-page URLs rather than
    // search URLs - confirm this is the intended source of the UPC.
    p.Product.BarCode = p.Product.Url.Split(new string[] { "Ntt=" }, StringSplitOptions.None).Last();

    string categoryPath = p.Product.GetAttributeValue("CATEGORY_PATH");
    if (string.IsNullOrEmpty(categoryPath))
    {
        return;
    }

    var hDoc = new HtmlDocument();
    hDoc.LoadHtml(categoryPath);

    // HtmlAgilityPack returns null, not an empty collection, when nothing matches.
    var nodes = hDoc.DocumentNode.SelectNodes("./li");
    if (nodes == null)
    {
        return;
    }

    var categoryLinks = new List<string>();
    var categoryName = new List<string>();

    foreach (var n in nodes)
    {
        var anchor = n.SelectSingleNode("./a");
        if (anchor == null)
        {
            continue; // breadcrumb item without a link
        }

        categoryLinks.Add("https://#"); // placeholder URL; breadcrumb hrefs are not used
        categoryName.Add(anchor.InnerHtml);
    }

    if (categoryName.Count == 0)
    {
        return;
    }

    // Chain the breadcrumb items parent -> child and attach the product to the deepest one.
    p.Product.Category = CategoriesAdd(0, categoryName, categoryLinks, new Category());
}

/// <summary>
/// Builds a single chain of nested categories (each one the child of the previous) under
/// <paramref name="parent"/>, starting at <paramref name="index"/>.
/// </summary>
/// <param name="index">Position in the lists at which to start.</param>
/// <param name="CategoriesNames">Category names, outermost first.</param>
/// <param name="CategoriesLinks">Category URLs, parallel to <paramref name="CategoriesNames"/>.</param>
/// <param name="parent">Category that receives the first created category as its child.</param>
/// <returns>The deepest category created, or <paramref name="parent"/> when nothing was created.</returns>
public Category CategoriesAdd(int index, List<string> CategoriesNames, List<string> CategoriesLinks, Category parent)
{
    // Bound by the shorter list so mismatched inputs cannot index past the end of either one.
    var limit = Math.Min(CategoriesNames.Count, CategoriesLinks.Count);
    var current = parent;

    for (var i = index; i < limit; i++)
    {
        var category = new Category();
        category.Name = CategoriesNames[i];
        category.SourceUrl = CategoriesLinks[i];
        current.AddCategory(category);
        current = category;
    }

    return current;
}

#region Settings

/// <summary>
/// Caches task info, product and category from the parameters and lazily creates the
/// product page loader (<see cref="HplProduct"/>, also shared through <c>Helper.Hpl</c>).
/// </summary>
/// <param name="p">Parameters of the current product run.</param>
private void Init(RunProductScriptParameters p)
{
    _mti = p.Process.m_ti;
    _product = p.Product;
    _category = p.Category;

    if (HplProduct != null) return;

    // Check the cast result before it is dereferenced for the shop URL.
    var hpl = p.Process as OneProductLoader;
    if (hpl == null) return;

    if (string.IsNullOrEmpty(_domen))
        _domen = hpl.State.GrabberSettings.Settings.ShopUrl;

    HplProduct = Helper.Hpl = hpl.State.Proxy.GetHtmlPageLoader(_domen);
}

/// <summary>
/// Caches task info and category from the parameters and lazily creates the page loader
/// used for collecting product links (<see cref="HplLinks"/>).
/// </summary>
/// <param name="p">Parameters of the current category link run.</param>
private void Init(GetProductLinksForCategoryScriptParameters p)
{
    _mti = p.Process.m_ti;
    _category = p.Category;

    if (HplLinks != null) return;

    // Check the cast result before it is dereferenced for the shop URL.
    var hpl = p.Process as OneCategoryLoader;
    if (hpl == null) return;

    if (string.IsNullOrEmpty(_domen))
        _domen = hpl.State.GrabberSettings.Settings.ShopUrl;

    HplLinks = hpl.State.Proxy.GetHtmlPageLoader(_domen);
}

/// <summary>
/// Caches task info, resets the root category (ID "0") and lazily creates the page loader
/// used for building the catalog (<see cref="HplCatalog"/>).
/// </summary>
/// <param name="p">Parameters of the catalog build.</param>
private void Init(GrabCatalogBuildScriptParameters p)
{
    _mti = p.Process.m_ti;
    _category = new Category { ID = "0" };

    if (HplCatalog != null) return;

    // Check the cast result before it is dereferenced for the shop URL.
    var hpl = p.Process as GrabCatalogFromWeb;
    if (hpl == null) return;

    if (string.IsNullOrEmpty(_domen))
        _domen = hpl.State.GrabberSettings.Settings.ShopUrl;

    HplCatalog = hpl.State.Proxy.GetHtmlPageLoader(_domen);
}

/// <summary>
/// Prepares a freshly grabbed product: decodes its name, computes the hash-based fallback ID,
/// strips id/class attributes and line breaks/tabs/whitespace runs from both descriptions,
/// fills missing ID and article values, adds the main image, and normalises price and URL rewrite.
/// </summary>
/// <param name="product">Product to normalise in place.</param>
private void StartProduct(Product product)
{
    const string attrsToStrip = "id[next]class";
    const string whitespacePattern = @"\r|\n|\t|\s{2,}";

    product.Name = HttpUtility.HtmlDecode(product.Name);
    _hashCod = Helper.HashCod = Helper.GetHashCodeString(product.Name + product.Url);

    // Strip tag attributes and whitespace noise from the full description.
    if (!string.IsNullOrEmpty(product.FullDescription))
    {
        var cleanedFull = product.FullDescription.RemoveTagAttr(attrsToStrip);
        product.FullDescription = Regex.Replace(cleanedFull, whitespacePattern, "");
    }

    // Same clean-up for the short description.
    if (!string.IsNullOrEmpty(product.SmallDescription))
    {
        var cleanedSmall = product.SmallDescription.RemoveTagAttr(attrsToStrip);
        product.SmallDescription = Regex.Replace(cleanedSmall, whitespacePattern, "");
    }

    // Fall back to the hash code when no ID was extracted, then mirror the ID into the article.
    if (string.IsNullOrEmpty(product.ID))
    {
        product.ID = _hashCod;
    }

    if (string.IsNullOrEmpty(product.GetAttributeValue("SYS_PRODUCT_ART")))
    {
        product.SetAttributeValue("SYS_PRODUCT_ART", product.ID);
    }

    // Prefer the full-size main image; use the small one when it is missing.
    var mainImage = product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE");
    if (string.IsNullOrEmpty(mainImage))
    {
        mainImage = product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE_SMALL");
    }
    product.ImageAdd(mainImage);

    if (_imageNameSku)
    {
        product.Image = Helper.MakeImgName2Sku(product.ImageFull, 0);
    }

    product.Price = product.Price.GetTruePrice();

    if (!string.IsNullOrEmpty(product.UrlRewrite))
    {
        product.UrlRewrite = Helper.UrlRewrite(product.UrlRewrite, "-");
    }
}

/// <summary>
/// Final clean-up of a product: cleans HTML blocks, normalises the price, strips id/class
/// attributes from both descriptions and, when the price is not valid, logs an error and
/// sets the price to "0".
/// </summary>
/// <param name="product">Product to finalise in place.</param>
private void FinishProduct(Product product)
{
    const string attrsToStrip = "id[next]class";

    product.HtmlBlocksClean();
    product.Price = product.Price.GetTruePrice();

    // Remove tag attributes from the full description.
    var fullDescription = product.FullDescription;
    if (!string.IsNullOrEmpty(fullDescription))
    {
        product.FullDescription = fullDescription.RemoveTagAttr(attrsToStrip);
    }

    // Remove tag attributes from the short description.
    var smallDescription = product.SmallDescription;
    if (!string.IsNullOrEmpty(smallDescription))
    {
        product.SmallDescription = smallDescription.RemoveTagAttr(attrsToStrip);
    }

    if (product.PriceIsOk)
    {
        return;
    }

    var message = "Price: ‘" + product.Price +
                  "’ does not exist or not configured and will be set 0. Link product: \" " + product.Url +
                  " \"";
    _mti.AddLogError(message);
    product.Price = "0";
}

#endregion

}

#region class Helper
public static class Helper
{
public static string HashCod = "";
public static HtmlPageLoader Hpl;
public static Dictionary<string, string> TranslatePairs { get; set; }

/// <summary>
/// Normalises a price string: replaces every '.' with ',' and removes all whitespace.
/// </summary>
/// <param name="str">Raw price text; may be null or empty.</param>
/// <returns>The normalised price, or <paramref name="str"/> unchanged when it is null or empty.</returns>
public static string GetTruePrice(this string str)
{
    // A product without a price must not crash the pipeline.
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    return Regex.Replace(str.Replace(".", ","), @"\s*", "");
}

/// <summary>
/// Translates a string using the <see cref="TranslatePairs"/> dictionary.
/// </summary>
/// <param name="str">Text to translate.</param>
/// <param name="wordTranslate">When false (default), punctuation-delimited phrases are
/// translated first; the word-by-word pass always runs afterwards.</param>
/// <returns>The translated text with words separated by single spaces and trimmed.</returns>
public static string Translate(this String str, bool wordTranslate = false)
{
    if (!wordTranslate)
    {
        // First pass: whole phrases between punctuation marks.
        var phrases = str
            .Split(new[] { ',', '.', '!', '?', ';', ':', '(', ')', '"', '[', ']', '{', '}' })
            .Where(word => !string.IsNullOrEmpty(word));

        foreach (var word in phrases)
            str = str.Replace(word, GetTranslate(word));
    }

    // Second pass: go through the individual words.
    var text = str.Split(new[] { ' ' }).Aggregate(string.Empty, (current, word) => current + (" " + GetTranslate(word)));
    return text.Trim();
}

/// <summary>
/// Looks up the lower-cased, trimmed form of <paramref name="str"/> in
/// <see cref="TranslatePairs"/>; returns the stored translation or the trimmed input.
/// </summary>
private static string GetTranslate(string str)
{
    // Create the dictionary on first use.
    var pairs = TranslatePairs ?? (TranslatePairs = new Dictionary<string, string>());

    var key = str.ToLowerInvariant().Trim();
    string translated;
    if (pairs.TryGetValue(key, out translated))
    {
        return translated;
    }

    return str.Trim();
}

/// <summary>
/// Loads an HTML fragment into the shared <see cref="Hpl"/> loader and extracts the values
/// selected by an XPath expression.
/// </summary>
/// <param name="input">HTML fragment to parse.</param>
/// <param name="xpath">XPath selecting the nodes of interest.</param>
/// <param name="attr">Attribute to read from each node; empty or null passes an empty
/// attribute name to the extractor.</param>
/// <returns>The list produced by <c>TextUtils.ExtractValuesByXpath</c>.</returns>
public static List<string> GetListFromBlock(string input, string xpath, string attr)
{
    List<string> list;
    Hpl.SetContent(input);
    // The attribute argument used to be ignored (an empty string was always passed).
    TextUtils.ExtractValuesByXpath(xpath, attr ?? "", Hpl.HtmlDoc, true, out list);
    return list;
}

/// <summary>
/// Returns the part of the string before the first occurrence of the stop word,
/// or the whole string when the stop word does not occur.
/// </summary>
public static string StopWord(this String str, string stop)
{
    var cut = str.IndexOf(stop, StringComparison.Ordinal);
    if (cut < 0)
    {
        return str;
    }

    return str.Substring(0, cut);
}

/// <summary>
/// Returns the part of the string after the first occurrence of the start word,
/// or the whole string when the start word does not occur.
/// </summary>
public static string StartWord(this String str, string stop)
{
    var found = str.IndexOf(stop, StringComparison.Ordinal);
    if (found < 0)
    {
        return str;
    }

    return str.Substring(found + stop.Length);
}

/// <summary>
/// Turns a possibly relative link into an absolute one.
/// </summary>
/// <param name="str">Link to check; returned unchanged when it starts with "http".</param>
/// <param name="domen">Base address prepended to relative links.</param>
/// <returns>
/// The input itself, "http:" + input for protocol-relative links ("//..."), otherwise the
/// base and the link joined by exactly one '/'.
/// </returns>
public static string TrueLink(this String str, string domen="")
{
    if (str.StartsWith("http"))
    {
        return str;
    }

    var protocolRelative = Regex.IsMatch(str, @"^\/{2}");
    if (protocolRelative)
    {
        return "http:" + str;
    }

    // Drop one leading slash from the link and one trailing slash from the base.
    var path = Regex.Replace(str, @"^\/", "");
    var host = Regex.Replace(domen, @"\/$", "");
    return host + "/" + path;
}

/// <summary>
/// Returns the part of the string before the last occurrence of the stop word,
/// or the whole string when the stop word does not occur.
/// </summary>
public static string LastStopWord(this String str, string stop)
{
    var cut = str.LastIndexOf(stop, StringComparison.Ordinal);
    if (cut < 0)
    {
        return str;
    }

    return str.Substring(0, cut);
}

/// <summary>
/// Returns the part of the string after the last occurrence of the start word,
/// or the whole string when the start word does not occur.
/// </summary>
public static string LastStartWord(this String str, string stop)
{
    var found = str.LastIndexOf(stop, StringComparison.Ordinal);
    if (found < 0)
    {
        return str;
    }

    return str.Substring(found + stop.Length);
}

/// <summary>
/// Removes the listed attributes (with double- or single-quoted values) from HTML text.
/// </summary>
/// <param name="str">HTML text to clean.</param>
/// <param name="replaceItems">Attribute names separated by "[next]", e.g. "id[next]class".
/// Each name is used as a regular-expression fragment.</param>
/// <returns>The text with every <c>name="..."</c> / <c>name='...'</c> occurrence removed.</returns>
public static string RemoveTagAttr(this String str, string replaceItems)
{
    // Plain ASCII quotes in the pattern: the single-quoted alternative must match '...'.
    return SplitToNextMark(replaceItems).Aggregate(str, (current, item) => Regex.Replace(current, item + @"=(""[^""]*""|'[^']*')", ""));
}

/// <summary>
/// Builds "http://" + host from the category's source URL.
/// </summary>
/// <param name="category">Category whose <c>SourceUrl</c> is parsed.</param>
/// <returns>The scheme-prefixed host name.</returns>
public static string GetDomen(Category category)
{
    var host = new Uri(category.SourceUrl).Host;
    return "http://" + host;
}
/// <summary>
/// Builds "http://" + host from the product's URL.
/// </summary>
/// <param name="product">Product whose <c>Url</c> is parsed.</param>
/// <returns>The scheme-prefixed host name.</returns>
public static string GetDomen(Product product)
{
    var host = new Uri(product.Url).Host;
    return "http://" + host;
}

/// <summary>
/// Returns the absolute value of the string's hash code as an invariant-culture string,
/// so the result never carries a minus sign.
/// </summary>
/// <param name="item">Text to hash.</param>
/// <returns>A non-negative decimal number as text.</returns>
/// <remarks>
/// Based on <see cref="string.GetHashCode()"/>, which .NET does not guarantee to be stable
/// across runtimes or processes.
/// </remarks>
public static string GetHashCodeString(string item)
{
    // Widen before negating: int.MinValue * -1 overflows and stays negative.
    long hash = Math.Abs((long)item.GetHashCode());
    return hash.ToString(CultureInfo.InvariantCulture);
}

/// <summary>
/// Builds an image file name from the current <see cref="HashCod"/> instead of the
/// original image name.
/// </summary>
/// <param name="imgHref">Original image address; used only to pick up the file extension
/// (jpg, png, bmp, gif, jpeg; ".jpg" when none is found).</param>
/// <param name="idx">Ordinal number of the image; appended as "-idx" only when greater than 1.</param>
/// <returns>HashCod, the optional "-idx" suffix, and the extension.</returns>
public static string MakeImgName2Sku(string imgHref, int idx)
{
    var extension = Regex.Match(imgHref, @"\.(jpg|png|bmp|gif|jpeg)", RegexOptions.IgnoreCase).Value;
    if (string.IsNullOrEmpty(extension))
    {
        extension = ".jpg";
    }

    var suffix = idx > 1
        ? "-" + idx.ToString(CultureInfo.InvariantCulture)
        : string.Empty;

    return HashCod + suffix + extension;
}

/// <summary>
/// Joins all image links into one string using the given delimiter.
/// </summary>
/// <param name="listImages">Image links to join.</param>
/// <param name="delimeter">Text placed between consecutive links.</param>
/// <returns>The joined links; an empty string for an empty list.</returns>
public static string GetAllImageLinksAsString(List<string> listImages, string delimeter)
{
    // string.Join copes with an empty list, where the previous Substring call threw.
    return string.Join(delimeter, listImages.ToArray());
}

/// <summary>
/// Joins all image names into one string, putting <paramref name="prefix"/> before each
/// name and <paramref name="delimeter"/> between them.
/// </summary>
/// <param name="listImages">Image names to join.</param>
/// <param name="delimeter">Text placed between consecutive names.</param>
/// <param name="prefix">Text prepended to every name.</param>
/// <returns>The joined names; an empty string for an empty list.</returns>
public static string GetAllImageNamesAsString(List<string> listImages, string delimeter, string prefix)
{
    // string.Join copes with an empty list, where the previous Substring call threw.
    return string.Join(delimeter, listImages.Select(img => prefix + img).ToArray());
}

/// <summary>
/// Splits a string into parts on the given separator, dropping empty parts.
/// </summary>
/// <param name="input">Input string.</param>
/// <param name="separator">Separator text; "[next]" by default.</param>
/// <returns>A list with the non-empty parts.</returns>
public static List<string> SplitToNextMark(string input, string separator = "[next]")
{
    string[] separators = { separator };
    return new List<string>(input.Split(separators, StringSplitOptions.RemoveEmptyEntries));
}

/// <summary>
/// Produces a URL-rewrite slug via <c>AttributableItem.UrlRewriteGet</c>, replaces each run
/// of underscores with <paramref name="delim"/>, and removes one leading and one trailing
/// delimiter.
/// </summary>
/// <param name="str">Text to turn into a slug.</param>
/// <param name="delim">Delimiter used instead of underscores.</param>
/// <returns>The rewritten slug.</returns>
public static string UrlRewrite(string str, string delim)
{
    // The delimiter is data, not a pattern: escape it for the replacement string ('$')
    // and for the regular expression (e.g. '.', '+').
    var replacement = delim.Replace("$", "$$");
    var escaped = Regex.Escape(delim);

    str = new Regex(@"_+").Replace(AttributableItem.UrlRewriteGet(str), replacement);
    str = new Regex(string.Format("^{0}|{0}$", escaped)).Replace(str, "");
    return str;
}

/// <summary>
/// Splits an "xpath[attr]attribute" expression into its XPath and attribute parts.
/// </summary>
/// <param name="xpath">Expression of the form <c>xpath</c> or <c>xpath[attr]attribute</c>.</param>
/// <param name="separator">Separator text; "[attr]" by default.</param>
/// <returns>
/// A pair whose key is the XPath and whose value is the attribute name (empty when absent).
/// Both are empty when the input contains nothing but separators or is empty.
/// </returns>
public static KeyValuePair<string, string> GetPairsXpath2Attr(string xpath, string separator = "[attr]")
{
    var pair = xpath.Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries);

    // Empty input yields no parts; indexing pair[0] would throw.
    if (pair.Length == 0)
    {
        return new KeyValuePair<string, string>(string.Empty, string.Empty);
    }

    return new KeyValuePair<string, string>(pair[0], (pair.Length > 1) ? pair[1] : string.Empty);
}

/// <summary>
/// Extracts a name from a link by removing everything up to and including the last '/'.
/// </summary>
/// <param name="href">Link text.</param>
/// <returns>The trailing segment of the link; the link itself when it has no '/'.</returns>
public static string CreateNameFromHref(string href)
{
    var upToLastSlash = new Regex(@"^[\w\W]*/");
    return upToLastSlash.Replace(href, string.Empty);
}

/// <summary>
/// Wrapper around <c>TextUtils.ExtractValuesByXpath</c> that understands the "[next]" and
/// "[attr]" markers: alternatives are tried in order and the first non-empty result wins.
/// </summary>
/// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument to query.</param>
/// <param name="xpath">"xpath[attr]attribute[next]xpath[attr]attribute" style expression.</param>
/// <returns>The first non-empty result list, or an empty list when nothing matched.</returns>
public static List<string> ExtractValuesByXpath(HtmlDocument htmlDoc, string xpath)
{
    foreach (var alternative in SplitToNextMark(xpath))
    {
        var pair = GetPairsXpath2Attr(alternative);

        List<string> found;
        TextUtils.ExtractValuesByXpath(pair.Key, pair.Value, htmlDoc, true, out found);

        if (found == null || found.Count <= 0)
        {
            continue;
        }

        return found;
    }

    return new List<string>(0);
}

/// <summary>
/// Wrapper around <c>TextUtils.ExtractFirstValueByXpath</c> that understands the "[next]"
/// and "[attr]" markers: alternatives are tried in order and the first non-empty value wins.
/// </summary>
/// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument to query.</param>
/// <param name="xpath">"xpath[attr]attribute[next]xpath[attr]attribute" style expression.</param>
/// <returns>The first non-empty value, or an empty string when nothing matched.</returns>
public static string ExtractFirstValueByXpath(HtmlDocument htmlDoc, string xpath)
{
    foreach (var alternative in SplitToNextMark(xpath))
    {
        var pair = GetPairsXpath2Attr(alternative);
        var value = TextUtils.ExtractFirstValueByXpath(pair.Key, pair.Value, htmlDoc);

        if (string.IsNullOrEmpty(value))
        {
            continue;
        }

        return value;
    }

    return string.Empty;
}

/// <summary>
/// Replaces the loader's content with the list markup rebuilt by
/// <c>RestoreHtml.ProcessingLiTag</c> from the current content.
/// </summary>
/// <param name="hpl">Page loader whose content is rewritten in place.</param>
public static void HtmlRestore(this HtmlPageLoader hpl)
{
    var rebuilt = new RestoreHtml(hpl).ProcessingLiTag(hpl.Content);
    hpl.SetContent(rebuilt);
}

/// <summary>
/// Rebuilds list markup in which &lt;li&gt; elements ended up nested inside each other:
/// every nested item is emitted as a sibling &lt;li&gt; of one flat &lt;ul&gt;.
/// </summary>
class RestoreHtml
{
    private readonly HtmlPageLoader _hpl;

    /// <param name="hpl">Page loader used as scratch space for parsing fragments
    /// (its content is overwritten while processing).</param>
    public RestoreHtml(HtmlPageLoader hpl)
    {
        _hpl = hpl;
    }

    /// <summary>
    /// Flattens the &lt;li&gt; items found in <paramref name="temp"/>.
    /// </summary>
    /// <param name="temp">HTML fragment containing the list items.</param>
    /// <returns>"&lt;ul&gt;...&lt;/ul&gt;" containing one &lt;li&gt; per extracted item.</returns>
    public string ProcessingLiTag(string temp)
    {
        var text = string.Empty;
        _hpl.SetContent(temp);

        foreach (var item in ExtractValuesByXpath(_hpl.HtmlDoc, "/li"))
        {
            temp = item;
            while (true)
            {
                _hpl.SetContent(temp);
                var innerData = ExtractFirstValueByXpath(_hpl.HtmlDoc, "/li");

                if (string.IsNullOrEmpty(innerData))
                {
                    // No nested item left: emit what remains and move to the next item.
                    text += "<li>" + temp + "</li>";
                    break;
                }

                var ind = temp.IndexOf(innerData, StringComparison.Ordinal);
                // Stop when the nested content cannot be located or does not shrink the input;
                // the latter would otherwise loop forever.
                if (ind <= -1 || innerData.Length >= temp.Length) break;

                // Keep the text before the nested item. The 4 characters dropped are presumably
                // the nested "<li>" opening tag - TODO confirm. Clamp so a short prefix cannot
                // produce a negative length.
                var res = temp.Substring(0, Math.Max(0, ind - 4));
                text += "<li>" + res + "</li>";
                temp = innerData;
            }
        }

        return "<ul>" + text + "</ul>";
    }
}
}
#endregion

#region class ParametrsProccessing
class ParametrsProcessing : CustomScript
{
private readonly HtmlPageLoader _hpl;
private readonly Product _product;
private bool _translate;
private readonly string _htmlParameters;

public CmsEngine Cms { get; set; }
public List<string> NotTakedName { get; set; }

public ParametrsProcessing(Product product, HtmlPageLoader hpl, string htmlParameters, bool translate = false)
: base(product)
{
_translate = translate;
_htmlParameters = htmlParameters;
_hpl = hpl;
_product = product;
}

/// <summary>
/// Получаем характеристики с помощью xpath
/// </summary>
/// <param name="split">xPath разделитель</param>
/// <param name="name">xPath имя свойства</param>
/// <param name="value">xPath значение свойства</param>
public void GetDinamicAttributesXpath(string split, string name, string value)
{
_hpl.SetContent(_htmlParameters);
List<string> lines;
TextUtils.ExtractValuesByXpath(split, "", _hpl.HtmlDoc, true, out lines);

foreach (var line in lines)
{
_hpl.SetContent(line.Replace("\r", "").Replace("\t", "").Replace("\n", ""));
var newColumnName = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, name);
var attributeValue = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, value);

if (!string.IsNullOrWhiteSpace(newColumnName) && !string.IsNullOrWhiteSpace(attributeValue))
AddAttibute(newColumnName, attributeValue);
}
}

/// <summary>
/// Получаем характеристики с помощью Regex
/// </summary>
/// <param name="split">Regex разделитель</param>
/// <param name="name">Regex имя свойства</param>
/// <param name="value">Regex значение свойства</param>
public void GetDinamicAttributesRegex(string split, string name, string value)
{
_hpl.SetContent(_htmlParameters);
var lines = _hpl.Content.Split(new [] { split }, StringSplitOptions.RemoveEmptyEntries);
foreach (var line in lines)
{
var val = line.Replace("\r", "").Replace("\t", "").Replace("\n", "");
var columnName = Regex.Match(val, name).Groups[1].Value;
var attributeValue = Regex.Match(val, value).Groups[1].Value;
AddAttibute(columnName, attributeValue);
}
}

private void AddAttibute(string columnName, string attrValue)
{
var regex = new Regex(@"<[^>]*>");
columnName = regex.Replace(columnName, "").Replace(":", "").Replace(";", ",").Replace("\"", "”").Trim();
attrValue = regex.Replace(attrValue, "").Replace("\"", "”").Replace(";", ",").Trim();
if (!CheckNameProperties(columnName) && !string.IsNullOrWhiteSpace(columnName) && !string.IsNullOrWhiteSpace(attrValue))
_product.AddDynamicAttribute(columnName, attrValue);
}

// ReSharper disable once UnusedMember.Local
private void AddParametrsFromProduct()
{
var properties = string.Empty;
if (Cms == CmsEngine.Prestashop)
{
properties = _product.GetDynamicAttributes().Aggregate(properties, (current, property) => current + string.Format(",{0}:{1}", property.Key, _product.GetAttributeValue(property.Value)));
_product.SetAttributeValue("SYS_PROPERTIES_PRESTA", properties.Trim(‘,’));
}

if (Cms != CmsEngine.Advatshop) return;
properties = _product.GetDynamicAttributes().Aggregate(properties, (current, property) => current + string.Format(",{0}:{1}", property.Key, _product.GetAttributeValue(property.Value)));
_product.SetAttributeValue("SYS_PARAMETRS_ADVANTSHOP", properties.Trim(‘,’));
}

/// <summary>
/// Tells whether an attribute name is present in the NotTakedName collection.
/// </summary>
/// <param name="name">Attribute name to look up.</param>
/// <returns>
/// <c>true</c> when NotTakedName is non-null, non-empty and contains <paramref name="name"/>;
/// otherwise <c>false</c>.
/// </returns>
private bool CheckNameProperties(string name)
{
    var hasEntries = NotTakedName != null && NotTakedName.Count >= 1;
    return hasEntries && NotTakedName.Contains(name);
}

/// <summary>
/// Builds a table of values with XPath: the stored parameters HTML is cut into fragments by
/// <paramref name="xpathSplit"/> and every query from <paramref name="xpathData"/> is run
/// against each fragment.
/// </summary>
/// <param name="xpathSplit">XPath of the block used to split the page into fragments.</param>
/// <param name="xpathData">
/// Queries packed into one string: queries are separated by "[next]", and the text after an
/// optional "[attr]" marker is passed to the extractor as its second argument,
/// e.g. "//a[attr]href[next]//div[attr]class".
/// </param>
/// <returns>
/// One list per fragment; each list holds, per query, the first extracted value or an empty
/// string when the query produced nothing.
/// </returns>
public List<List<string>> GetDataFromXpath(string xpathSplit, string xpathData)
{
    _hpl.SetContent(_htmlParameters);

    List<string> fragments;
    TextUtils.ExtractValuesByXpath(xpathSplit, "", _hpl.HtmlDoc, true, out fragments);

    var table = new List<List<string>>();
    for (var f = 0; f < fragments.Count; f++)
    {
        // Load the fragment into the loader; the queries below read _hpl.HtmlDoc
        _hpl.SetContent(fragments[f]);

        var row = new List<string>();
        var queries = GetArrayFromDelim("[next]", xpathData);
        for (var q = 0; q < queries.Length; q++)
        {
            var pieces = GetArrayFromDelim("[attr]", queries[q]);

            var secondArgument = string.Empty;
            if (pieces.Length > 1)
            {
                secondArgument = pieces[1];
            }

            List<string> found;
            TextUtils.ExtractValuesByXpath(pieces[0], secondArgument, _hpl.HtmlDoc, false, out found);

            if (found.Count > 0)
            {
                row.Add(found[0]);
            }
            else
            {
                row.Add(string.Empty);
            }
        }

        table.Add(row);
    }

    return table;
}

/// <summary>
/// Splits <paramref name="input"/> on every occurrence of the literal string <paramref name="delim"/>.
/// </summary>
/// <param name="delim">Separator string.</param>
/// <param name="input">Text to split.</param>
/// <returns>All pieces in order, empty pieces included.</returns>
private string[] GetArrayFromDelim(string delim, string input)
{
    var separators = new string[1];
    separators[0] = delim;

    var pieces = input.Split(separators, StringSplitOptions.None);
    return pieces;
}
}

/// <summary>
/// Target shop engines; the value decides which system attribute receives the serialized
/// dynamic attributes of a product.
/// </summary>
enum CmsEngine
{
    /// <summary>Properties are written to "SYS_PROPERTIES_PRESTA".</summary>
    Prestashop = 0,

    /// <summary>Properties are written to "SYS_PARAMETRS_ADVANTSHOP".</summary>
    Advatshop = 1
}
#endregion

#region class ImageProcessing
/// <summary>
/// Collects additional product images from a fragment of HTML and attaches them to a product.
/// </summary>
public class ImageProcessing : CustomScript
{
    private readonly string _imagesBlock;
    private readonly bool _imageNameSku;
    private readonly string _domen;
    private readonly HtmlPageLoader _hpl;
    private readonly Product _product;
    private readonly int _countImg;
    private int _idx;

    /// <summary>
    /// Creates a processor that does not limit the number of images.
    /// </summary>
    /// <param name="product">Product that receives the images.</param>
    /// <param name="hpl">Page loader used for XPath extraction.</param>
    /// <param name="imageNameSku">When true, images are added with a generated file name (see <see cref="GetNameImageFromHref"/>).</param>
    /// <param name="domen">Site domain passed to TrueLink when resolving image links.</param>
    /// <param name="imagesBlock">Block with the image markup.</param>
    public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock)
        : this(product, hpl, imageNameSku, domen, imagesBlock, 0)
    {
    }

    /// <summary>
    /// Creates a processor with an optional cap on the number of images taken per call.
    /// </summary>
    /// <param name="product">Product that receives the images.</param>
    /// <param name="hpl">Page loader used for XPath extraction.</param>
    /// <param name="imageNameSku">When true, images are added with a generated file name (see <see cref="GetNameImageFromHref"/>).</param>
    /// <param name="domen">Site domain passed to TrueLink when resolving image links.</param>
    /// <param name="imagesBlock">Block with the image markup.</param>
    /// <param name="countImg">Maximum number of source links to keep; zero or negative means no limit.</param>
    public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock, int countImg)
        : base(product)
    {
        _product = product;
        _hpl = hpl;
        _domen = domen;
        _imagesBlock = imagesBlock;
        _imageNameSku = imageNameSku;
        _countImg = countImg > 0 ? countImg : 0;

        // Numbering of generated image names continues after the images the product already has
        _idx = product.ImagesCount + 1;
    }

    /// <summary>
    /// Extracts image links from the image block with regular expressions and adds them to the product.
    /// </summary>
    /// <param name="split">Literal separator used to cut the block into parts.</param>
    /// <param name="contains">Substring a part must contain to be considered; empty string disables the filter.</param>
    /// <param name="reg">Regex pattern(s), split by Helper.SplitToNextMark; capture group 1 is the link.</param>
    public void GetAdditionalImages(string split, string contains, string reg)
    {
        var listSource = GetSourceImagesWithRegex(_imagesBlock, split, contains, reg);
        if (listSource.Count > 0)
        {
            AddImage(listSource);
        }
    }

    /// <summary>
    /// Extracts image links from the image block with XPath and adds them to the product.
    /// </summary>
    /// <param name="xpath">XPath expression handed to Helper.ExtractValuesByXpath.</param>
    public void GetAdditionalImages(string xpath)
    {
        _hpl.SetContent(_imagesBlock);
        var listSource = Helper.ExtractValuesByXpath(_hpl.HtmlDoc, xpath);
        if (listSource.Count > 0)
        {
            AddImage(listSource);
        }
    }

    /// <summary>
    /// Cuts <paramref name="blockAddImage"/> into parts and returns capture group 1 of every
    /// pattern applied to every accepted part, in part order then pattern order.
    /// </summary>
    /// <param name="blockAddImage">Text to search; a null block yields an empty list.</param>
    /// <param name="split">Literal separator used to cut the block (empty parts are dropped).</param>
    /// <param name="contains">
    /// Substring a part must contain. An empty string accepts every part. A null value yields
    /// an empty list (behavior kept from the original implementation).
    /// </param>
    /// <param name="reg">Regex pattern(s), split by Helper.SplitToNextMark.</param>
    /// <returns>Trimmed, non-blank matches.</returns>
    public static List<string> GetSourceImagesWithRegex(string blockAddImage, string split, string contains, string reg)
    {
        var result = new List<string>();
        if (blockAddImage == null || contains == null)
        {
            return result;
        }

        var parts = blockAddImage.Split(new[] { split }, StringSplitOptions.RemoveEmptyEntries);
        foreach (var part in parts)
        {
            // Short-circuit evaluation: Contains is only consulted when a filter is present
            var accepted = contains.Length == 0 || part.Contains(contains);
            if (!accepted)
            {
                continue;
            }

            foreach (var pattern in Helper.SplitToNextMark(reg))
            {
                var item = Regex.Match(part, pattern).Groups[1].Value.Trim();
                if (!string.IsNullOrWhiteSpace(item))
                {
                    result.Add(item);
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Adds every usable link from <paramref name="listSource"/> to the product as an image.
    /// When a limit is configured, the list is truncated in place to that many entries first.
    /// </summary>
    /// <param name="listSource">Image links as found in the markup.</param>
    private void AddImage(List<string> listSource)
    {
        if (_countImg > 0 && listSource.Count > _countImg)
        {
            // Keep only the first _countImg links
            listSource.RemoveRange(_countImg, listSource.Count - _countImg);
        }

        foreach (var item in listSource)
        {
            if (string.IsNullOrWhiteSpace(item))
            {
                continue;
            }

            var linkImg = item.TrueLink(_domen);

            // The product's main image is not added a second time
            if (linkImg == _product.ImageFull)
            {
                continue;
            }

            // While the product has no main image the name counter is reset to 1
            if (string.IsNullOrEmpty(_product.ImageFull))
            {
                _idx = 1;
            }

            if (_imageNameSku)
            {
                _product.ImageAdd(linkImg, GetNameImageFromHref(item, _idx));
                _idx++;
            }
            else
            {
                _product.ImageAdd(linkImg);
            }
        }
    }

    /// <summary>
    /// Builds the file name for an image by delegating to Helper.MakeImgName2Sku.
    /// </summary>
    /// <param name="imgHref">Path to the image.</param>
    /// <param name="idx">Index passed through to the helper.</param>
    /// <returns>The name produced by Helper.MakeImgName2Sku.</returns>
    public string GetNameImageFromHref(string imgHref, int idx)
    {
        return Helper.MakeImgName2Sku(imgHref, idx);
    }
}
#endregion

#region class WorksWithFiles

/// <summary>
/// Helper around a single file path: reads delimited text files into collections, reads the
/// first sheet of an Excel workbook, appends lines to text files and downloads files.
/// </summary>
class WorksWithFiles
{
    public string PathToDir { get; private set; }
    public string PathToFile { get; private set; }
    public string NameFile { get; private set; }
    public string ExtenFile { get; private set; }

    private readonly string[] _lines;
    private readonly bool _notFile;

    /// <summary>
    /// Opens <paramref name="file"/> with the default encoding detection of File.ReadAllLines.
    /// </summary>
    /// <param name="file">Path to the file.</param>
    public WorksWithFiles(string file) : this(file, null) { }

    /// <summary>
    /// Remembers the path parts of <paramref name="file"/> and, for non-Excel files that exist,
    /// reads all lines into memory. Excel files (.xlsx/.xls) are not read here; they are parsed
    /// on demand by <see cref="CreateCollectionFromExcel"/>.
    /// </summary>
    /// <param name="file">Path to the file.</param>
    /// <param name="encoding">Encoding used to read a text file, or null for the default.</param>
    public WorksWithFiles(string file, Encoding encoding)
    {
        GetValueFileVariables(file);

        // Never leave _lines null: the text-based CreateCollection overloads iterate it
        _lines = new string[0];

        if (IsExcelFile())
        {
            return;
        }

        if (!File.Exists(PathToFile))
        {
            _notFile = true;
            return;
        }

        _lines = encoding != null ? File.ReadAllLines(PathToFile, encoding) : File.ReadAllLines(PathToFile);
    }

    /// <summary>
    /// Appends one "link;name" line to the image-download file used by the old version of
    /// CatalogLoader. Call once per image; the name is produced by Helper.CreateNameFromHref.
    /// </summary>
    /// <param name="path">Path to the file.</param>
    /// <param name="imageLink">Link to the image.</param>
    public static void CreateImageDownloadFile(string path, string imageLink)
    {
        var text = string.Format("{0};{1}", imageLink, Helper.CreateNameFromHref(imageLink));
        CreateTextFile(path, text);
    }

    /// <summary>
    /// Appends one line of text to a UTF-8 text file, creating the file when needed.
    /// </summary>
    /// <param name="path">Path to the file.</param>
    /// <param name="text">Line to write.</param>
    public static void CreateTextFile(string path, string text)
    {
        CreateTextFile(path, text, Encoding.UTF8);
    }

    /// <summary>
    /// Appends one line of text to a text file, creating the file when needed.
    /// Forward slashes in the path are converted to backslashes, and a path that does not
    /// contain the application data directory is placed under it.
    /// </summary>
    /// <param name="path">Path to the file.</param>
    /// <param name="text">Line to write.</param>
    /// <param name="encoding">File encoding; null means UTF-8.</param>
    public static void CreateTextFile(string path, string text, Encoding encoding)
    {
        var startupPath = UtilSmall.ApplicationDataDirectory;
        path = path.Replace("/", "\\");
        if (!path.Contains(startupPath))
        {
            if (!path.StartsWith("\\", StringComparison.Ordinal))
            {
                path = "\\" + path;
            }
            path = startupPath + path;
        }

        if (encoding == null)
        {
            encoding = Encoding.UTF8;
        }

        // Append mode creates the file when it does not exist, so no separate "create" branch is needed
        using (var writer = new StreamWriter(path, true, encoding))
        {
            writer.WriteLine(text);
        }
    }

    /// <summary>
    /// Writes the items of the list as a single ';'-separated line (UTF-8).
    /// </summary>
    /// <param name="path">Path to the file.</param>
    /// <param name="texts">Values to write.</param>
    public static void CreateTextFile(string path, List<string> texts)
    {
        CreateTextFile(path, texts, Encoding.UTF8);
    }

    /// <summary>
    /// Writes the items of the list as a single ';'-separated line. Items are trimmed; a null
    /// item is written as an empty field; an empty list produces an empty line.
    /// </summary>
    /// <param name="path">Path to the file.</param>
    /// <param name="texts">Values to write.</param>
    /// <param name="encoding">File encoding; null means UTF-8.</param>
    public static void CreateTextFile(string path, List<string> texts, Encoding encoding)
    {
        var text = string.Join(";", texts.Select(t => (t ?? string.Empty).Trim()));
        CreateTextFile(path, text, encoding);
    }

    /// <summary>
    /// Builds a dictionary from the first sheet of the Excel workbook (.xlsx or .xls):
    /// column 0 is the key and column 1 the value, both lower-cased and trimmed.
    /// Reading stops at the first row that is missing or lacks either of the two cells.
    /// </summary>
    /// <returns>The dictionary; empty when the file is not an Excel workbook.</returns>
    public Dictionary<string, string> CreateCollectionFromExcel()
    {
        if (_notFile || !IsExcelFile())
        {
            return new Dictionary<string, string>(0);
        }

        var dic = new Dictionary<string, string>();
        var sheet = GetParseXls();
        for (var i = 0; i <= sheet.LastRowNum; i++)
        {
            var row = sheet.GetRow(i);
            if (row == null)
            {
                break;
            }

            var keyCell = row.GetCell(0);
            var valueCell = row.GetCell(1);
            if (keyCell == null || valueCell == null)
            {
                break;
            }

            dic[keyCell.ToString().ToLowerInvariant().Trim()] = valueCell.ToString().ToLowerInvariant().Trim();
        }
        return dic;
    }

    /// <summary>
    /// Builds a dictionary from a two-column text file. Double quotes are removed and both
    /// columns are lower-cased and trimmed. Lines with fewer than two non-empty columns are skipped.
    /// </summary>
    /// <param name="delimeter">Column separator used in the file.</param>
    /// <param name="reverse">When true, the second column is the key and the first the value.</param>
    /// <param name="valueString">
    /// false - values of repeated keys are skipped; true - they are appended to the stored
    /// value separated by ','.
    /// </param>
    /// <returns>The dictionary; empty when the file does not exist.</returns>
    public Dictionary<string, string> CreateCollection(string delimeter, bool reverse, bool valueString = false)
    {
        if (_notFile)
        {
            return new Dictionary<string, string>(0);
        }

        var dic = new Dictionary<string, string>();
        foreach (var line in _lines)
        {
            var parts = line.Split(new[] { delimeter }, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length <= 1)
            {
                continue;
            }

            var first = parts[0].Replace("\"", string.Empty).ToLowerInvariant().Trim();
            var second = parts[1].Replace("\"", string.Empty).ToLowerInvariant().Trim();
            var key = reverse ? second : first;
            var value = reverse ? first : second;

            string existing;
            if (!dic.TryGetValue(key, out existing))
            {
                dic.Add(key, value);
            }
            else if (valueString)
            {
                dic[key] = existing + "," + value;
            }
        }
        return dic;
    }

    /// <summary>
    /// Builds a dictionary whose key is one column of each line and whose value is the list of
    /// the remaining columns (quotes removed, trimmed). A blank column, or one already present
    /// in the same line's list, is stored as an empty string. Only the first line for a given
    /// key is kept.
    /// </summary>
    /// <param name="delimeter">Column separator used in the file.</param>
    /// <param name="columnIndex">Zero-based index of the column used as the key.</param>
    /// <returns>The dictionary; empty when the file does not exist.</returns>
    public Dictionary<string, List<string>> CreateCollection(string delimeter, int columnIndex = 0)
    {
        if (_notFile)
        {
            return new Dictionary<string, List<string>>(0);
        }

        var dic = new Dictionary<string, List<string>>();
        foreach (var line in _lines)
        {
            var parts = line.Split(new[] { delimeter }, StringSplitOptions.RemoveEmptyEntries);

            // Skip lines that do not have the key column
            if (parts.Length < 1 || parts.Length - 1 < columnIndex)
            {
                continue;
            }

            if (string.IsNullOrEmpty(parts[columnIndex]))
            {
                continue;
            }

            var list = new List<string>();
            for (var i = 0; i < parts.Length; i++)
            {
                if (i == columnIndex)
                {
                    continue;
                }

                var part = parts[i].Replace("\"", string.Empty).Trim();
                list.Add(!string.IsNullOrWhiteSpace(part) && !list.Contains(part) ? part : string.Empty);
            }

            var name = parts[columnIndex].Replace("\"", string.Empty).Trim();
            if (!dic.ContainsKey(name))
            {
                dic.Add(name, list);
            }
        }
        return dic;
    }

    /// <summary>
    /// Returns the distinct lines of the file (quotes removed, trimmed) in first-seen order.
    /// </summary>
    /// <returns>The list; empty when the file does not exist.</returns>
    public List<string> CreateCollection()
    {
        if (_notFile)
        {
            return new List<string>(0);
        }

        var seen = new HashSet<string>();
        var list = new List<string>();
        foreach (var line in _lines)
        {
            var value = line.Replace("\"", string.Empty).Trim();
            if (seen.Add(value))
            {
                list.Add(value);
            }
        }
        return list;
    }

    /// <summary>
    /// Splits the path into the properties exposed by this class.
    /// </summary>
    /// <param name="file">Path to the file.</param>
    private void GetValueFileVariables(string file)
    {
        PathToFile = file;
        NameFile = Path.GetFileName(file);
        PathToDir = Path.GetDirectoryName(file);
        ExtenFile = Path.GetExtension(file);
    }

    /// <summary>
    /// Downloads <paramref name="linkFile"/> to <see cref="PathToFile"/>, creating the target
    /// directory first when it is missing.
    /// </summary>
    /// <param name="linkFile">Link to the file.</param>
    public void DownloadFile(string linkFile)
    {
        CreateDirs();
        using (var wc = new WebClient())
        {
            wc.DownloadFile(linkFile, PathToFile);
        }
    }

    /// <summary>
    /// Creates the directory of the file when it does not exist yet.
    /// </summary>
    private void CreateDirs()
    {
        // A bare file name has no directory part; there is nothing to create in that case
        if (string.IsNullOrEmpty(PathToDir))
        {
            return;
        }

        if (!Directory.Exists(PathToDir))
        {
            Directory.CreateDirectory(PathToDir);
        }
    }

    /// <summary>
    /// Compares the file extension with <paramref name="extension"/>, ignoring case.
    /// </summary>
    private bool HasExtension(string extension)
    {
        return string.Equals(ExtenFile, extension, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// True for .xlsx and .xls files.
    /// </summary>
    private bool IsExcelFile()
    {
        return HasExtension(".xlsx") || HasExtension(".xls");
    }

    /// <summary>
    /// Opens the Excel workbook (XSSF for .xlsx, HSSF otherwise) and returns its first sheet.
    /// </summary>
    /// <returns>The sheet at index 0.</returns>
    private NPOI.SS.UserModel.ISheet GetParseXls()
    {
        NPOI.SS.UserModel.IWorkbook workbook;
        using (var file = new FileStream(PathToFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            if (HasExtension(".xlsx"))
            {
                workbook = new NPOI.XSSF.UserModel.XSSFWorkbook(file);
            }
            else
            {
                workbook = new NPOI.HSSF.UserModel.HSSFWorkbook(file);
            }
        }
        return workbook.GetSheetAt(0);
    }
}

#endregion

#region CreateGrabCatalog

/// <summary>
/// Helpers for building the grab catalog: extracting categories from HTML and attaching
/// them to the category tree.
/// </summary>
public class CreateGrabCatalog : CustomScript
{
    private readonly Category _mroot;
    private readonly HtmlPageLoader _hplCatalog;
    private readonly GrabCatalogBuildScriptParameters _p;
    private readonly string _domen;

    /// <summary>
    /// Creates the catalog builder.
    /// </summary>
    /// <param name="mroot">Root of the category tree.</param>
    /// <param name="hplCatalog">Page loader used for catalog pages.</param>
    /// <param name="p">Script parameters; used here for logging.</param>
    /// <param name="domen">Site domain used to resolve links and as the fallback category URL.</param>
    public CreateGrabCatalog(Category mroot, HtmlPageLoader hplCatalog, GrabCatalogBuildScriptParameters p, string domen)
        : base(mroot)
    {
        _domen = domen;
        _mroot = mroot;
        _hplCatalog = hplCatalog;
        _p = p;
    }

    /// <summary>
    /// Adds every category of the list to the grab catalog under the given parent.
    /// </summary>
    /// <param name="lCat">Categories to add.</param>
    /// <param name="id">ID of the parent category.</param>
    public void AddCategories(List<Category> lCat, string id)
    {
        foreach (var c in lCat)
        {
            AddCategory(c, id);
        }
    }

    /// <summary>
    /// Adds one category to the grab catalog under the parent with the given ID and copies the
    /// new node's ID back into <paramref name="c"/>. A category whose URL is "#" is logged as
    /// an error and skipped; the same happens when the parent lookup yields null.
    /// </summary>
    /// <param name="c">Category to add; its SourceUrl is passed through TrueLink with the site domain.</param>
    /// <param name="id">ID of the parent category.</param>
    public void AddCategory(Category c, string id)
    {
        var root = _mroot.FindCategoryById(id);

        if (c.SourceUrl == "#")
        {
            _p.Process.m_ti.AddLogError("The Category didn’t add : url category = " + c.SourceUrl);
            return;
        }

        // Without a parent node there is nothing to attach to; report instead of crashing
        if (root == null)
        {
            _p.Process.m_ti.AddLogError("The Category was not added : parent category not found, id = " + id);
            return;
        }

        c.SourceUrl = c.SourceUrl.TrueLink(_domen);

        var cat = root.AddCategory(c.Name, true);
        cat.SourceUrl = c.SourceUrl;
        c.ID = cat.ID;
        _p.Process.m_ti.AddLogInfo("Added category level " + cat.Level + " ‘" + cat.GetFullName() + "’, url " + cat.SourceUrl);
    }

    /// <summary>
    /// Collects categories from a page where the category name is text() and the link is an attribute.
    /// </summary>
    /// <param name="xpathName">Name query in the form xpath[attr]attr.</param>
    /// <param name="xpathLink">Link query in the form xpath[attr]attr.</param>
    /// <param name="root">Parent category; its SourceUrl is loaded when <paramref name="htmlPage"/> is null.</param>
    /// <param name="htmlPage">HTML to parse instead of loading the parent's page.</param>
    /// <returns>
    /// The categories (names with markup tags stripped and trimmed); empty when no link was
    /// found or the numbers of names and links differ.
    /// </returns>
    public List<Category> CreateListCategory(string xpathName, string xpathLink, Category root, string htmlPage = null)
    {
        if (htmlPage == null)
        {
            _hplCatalog.Load(root.SourceUrl);
        }
        else
        {
            _hplCatalog.SetContent(htmlPage);
        }

        var lc = new List<Category>();

        var name = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
        var link = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);

        // Names and links are paired by position, so the two lists must line up
        if (name.Count != link.Count || link.Count <= 0)
        {
            return lc;
        }

        for (var i = 0; i < name.Count; i++)
        {
            lc.Add(new Category { Name = Regex.Replace(name[i], @"<[^>]*>", "").Trim(), SourceUrl = link[i] });
        }
        return lc;
    }

    /// <summary>
    /// Takes categories from a single page: names, links and HTML blocks are extracted with
    /// three independent queries and paired by position.
    /// </summary>
    /// <param name="html">Page HTML.</param>
    /// <param name="xpathBlock">Query for the HTML block that belongs to each category.</param>
    /// <param name="xpathName">Query for the category names.</param>
    /// <param name="xpathLink">Query for the category links.</param>
    /// <param name="root">Not used by this method.</param>
    /// <returns>
    /// Category to block HTML. The site domain is used as the URL when the number of links
    /// differs from the number of names; an empty string is used when no block exists at the
    /// category's position.
    /// </returns>
    public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlock, string xpathName, string xpathLink, Category root)
    {
        _hplCatalog.SetContent(html);
        var lc = new Dictionary<Category, string>();

        var name = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
        var link = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);
        var blockHtml = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlock);

        var linksAligned = link.Count == name.Count;
        for (var a = 0; a < name.Count; a++)
        {
            var cat = new Category { Name = name[a].Trim(), SourceUrl = linksAligned ? link[a] : _domen };

            // Fewer blocks than names must not run past the end of the block list
            lc.Add(cat, a < blockHtml.Count ? blockHtml[a] : string.Empty);
        }
        return lc;
    }

    /// <summary>
    /// Takes categories from a single page: the page is first cut into blocks, then name, link
    /// and inner block are read from each block separately. Blocks without a name are skipped.
    /// </summary>
    /// <param name="html">Page HTML.</param>
    /// <param name="xpathBlockAll">Query that yields one block per category.</param>
    /// <param name="xpathBlock">Query for the HTML stored with the category, evaluated inside a block.</param>
    /// <param name="xpathName">Query for the category name, evaluated inside a block.</param>
    /// <param name="xpathLink">Query for the category link, evaluated inside a block.</param>
    /// <param name="root">Not used by this method.</param>
    /// <returns>Category to block HTML; the site domain is used as the URL when no link was found.</returns>
    public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlockAll, string xpathBlock, string xpathName, string xpathLink, Category root)
    {
        _hplCatalog.SetContent(html);
        var lc = new Dictionary<Category, string>();
        var blocks = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlockAll);
        foreach (var block in blocks)
        {
            _hplCatalog.SetContent(block);
            var name = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathName);
            var link = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathLink);
            var blockHtml = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathBlock);
            if (string.IsNullOrEmpty(name))
            {
                continue;
            }

            var cat = new Category { Name = name, SourceUrl = !string.IsNullOrEmpty(link) ? link : _domen };
            lc.Add(cat, !string.IsNullOrEmpty(blockHtml) ? blockHtml : string.Empty);
        }
        return lc;
    }
}

#endregion

#region HtmlData
/// <summary>
/// Snapshot of one HTML node: decoded inner text and inner HTML, the commonly used attributes
/// as properties, and every attribute in <see cref="Attr2Value"/>.
/// </summary>
public class HtmlData
{
    public string Text { get; set; }
    public string Value { get; set; }
    public string Title { get; set; }
    public string Alt { get; set; }
    public string Style { get; set; }
    public string Name { get; set; }
    public string Html { get; set; }
    public string Href { get; set; }
    public string Id { get; set; }
    public string Src { get; set; }
    public string Class { get; set; }
    public string OnClick { get; set; }

    /// <summary>Attribute name to raw (not HTML-decoded) attribute value.</summary>
    public Dictionary<string, string> Attr2Value { get; set; }

    /// <summary>
    /// Creates an empty instance: every string property is an empty string and the attribute
    /// map is empty.
    /// </summary>
    public HtmlData()
    {
        Attr2Value = new Dictionary<string, string>();
        Text = Value = Style = Name = Html = Href = Id = Src = Class = OnClick = Title = Alt = string.Empty;
    }

    /// <summary>
    /// Reads a number from <see cref="Text"/> or from an attribute. All whitespace is removed,
    /// then the first run of digits optionally followed by a comma group is parsed.
    /// Parsing uses double.TryParse without an explicit culture, so the meaning of the comma
    /// follows the current culture.
    /// </summary>
    /// <param name="str">Attribute name to read; empty or null reads <see cref="Text"/>.</param>
    /// <returns>The parsed number, or 0 when nothing parseable is found.</returns>
    /// <exception cref="KeyNotFoundException">The named attribute does not exist.</exception>
    public double ToDouble(string str = "")
    {
        var source = GetSourceText(str);
        source = Regex.Replace(source, @"\r|\t|\n|\s", "").Trim();
        source = Regex.Match(source, @"(\d+,*\d*)").Groups[1].Value.Trim(',');

        double res;
        return double.TryParse(source, out res) ? res : 0;
    }

    /// <summary>
    /// Reads an integer from <see cref="Text"/> or from an attribute. All whitespace is removed,
    /// commas are turned into dots and the first run of digits is parsed.
    /// </summary>
    /// <param name="str">Attribute name to read; empty or null reads <see cref="Text"/>.</param>
    /// <returns>The parsed number, or 0 when nothing parseable is found.</returns>
    /// <exception cref="KeyNotFoundException">The named attribute does not exist.</exception>
    public int ToInt32(string str = "")
    {
        var source = GetSourceText(str);
        source = Regex.Replace(source, @"\r|\t|\n|\s", "").Replace(",", ".").Trim();
        source = Regex.Match(source, @"(\d+)").Groups[1].Value;

        int res;
        return int.TryParse(source, out res) ? res : 0;
    }

    /// <summary>
    /// Returns the text a numeric conversion should work on: <see cref="Text"/> when no
    /// attribute name is given, otherwise the stored value of that attribute.
    /// </summary>
    /// <exception cref="KeyNotFoundException">The named attribute does not exist.</exception>
    private string GetSourceText(string attributeName)
    {
        if (string.IsNullOrEmpty(attributeName))
        {
            return Text;
        }

        string raw;
        if (!Attr2Value.TryGetValue(attributeName, out raw))
        {
            // Same exception type the dictionary indexer raised before, now with a useful message
            throw new KeyNotFoundException("This attribute is not found");
        }
        return raw;
    }

    /// <summary>
    /// Extracts an HtmlData object for the first node matched by any of the alternative
    /// queries in <paramref name="xpath"/> (alternatives are split by Helper.SplitToNextMark
    /// and tried in order).
    /// </summary>
    /// <param name="hpl">Loader holding the document to query.</param>
    /// <param name="xpath">XPath without an [attr] part.</param>
    /// <returns>Data of the first matching node, or an empty HtmlData when nothing matches; never null.</returns>
    public static HtmlData ExtractFirstHtmlData(HtmlPageLoader hpl, string xpath)
    {
        // Skip alternatives that match nothing, so that a later alternative gets its chance
        var node = Helper.SplitToNextMark(xpath)
            .Select(item => hpl.HtmlDoc.DocumentNode.SelectSingleNode(item))
            .FirstOrDefault(found => found != null);

        return GetHtmlData(node);
    }

    /// <summary>
    /// Extracts HtmlData objects for all nodes matched by the first alternative query in
    /// <paramref name="xpath"/> that matches anything (alternatives are split by
    /// Helper.SplitToNextMark and tried in order).
    /// </summary>
    /// <param name="hpl">Loader holding the document to query.</param>
    /// <param name="xpath">XPath without an [attr] part.</param>
    /// <returns>The extracted items; empty when no alternative matches.</returns>
    public static List<HtmlData> ExtractListHtmlData(HtmlPageLoader hpl, string xpath)
    {
        var res = new List<HtmlData>();
        foreach (var xp in Helper.SplitToNextMark(xpath))
        {
            var htmlNodes = hpl.HtmlDoc.DocumentNode.SelectNodes(xp);
            if (htmlNodes == null)
            {
                continue;
            }

            res.AddRange(htmlNodes.Select(GetHtmlData).Where(htmlData => htmlData != null));
            if (res.Count > 0)
            {
                break;
            }
        }
        return res;
    }

    /// <summary>
    /// Converts a node into an HtmlData object.
    /// </summary>
    /// <param name="htmlNode">Node to convert; null produces an empty HtmlData.</param>
    /// <returns>The populated object.</returns>
    private static HtmlData GetHtmlData(HtmlNode htmlNode)
    {
        if (htmlNode == null)
        {
            return new HtmlData();
        }

        var hd = new HtmlData
        {
            Text = HttpUtility.HtmlDecode(htmlNode.InnerText),
            Html = HttpUtility.HtmlDecode(htmlNode.InnerHtml)
        };

        foreach (var pair in htmlNode.Attributes)
        {
            // Indexer instead of Add: a repeated attribute name overwrites the earlier value
            // (as the property assignments below already do) instead of throwing
            hd.Attr2Value[pair.Name] = pair.Value;

            var value = HttpUtility.HtmlDecode(pair.Value);
            switch (pair.Name)
            {
                case "style":
                    hd.Style = value;
                    break;
                case "name":
                    hd.Name = value;
                    break;
                case "value":
                    hd.Value = value;
                    break;
                case "href":
                    hd.Href = value;
                    break;
                case "src":
                    hd.Src = value;
                    break;
                case "id":
                    hd.Id = value;
                    break;
                case "class":
                    hd.Class = value;
                    break;
                case "onclick":
                    hd.OnClick = value;
                    break;
                case "title":
                    hd.Title = value;
                    break;
                case "alt":
                    hd.Alt = value;
                    break;
            }
        }
        return hd;
    }
}
#endregion
}
[/code]