using System; using System.Collections.Generic; using System.Drawing; using System.Globalization; using System.IO; using System.Linq; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Web; using CatalogLoader; using CatalogLoaderCommon; using HtmlAgilityPack; using Newtonsoft.Json; using Newtonsoft.Json.Linq; //css_reference HtmlAgilityPack; //css_reference NPOI.dll //css_reference NPOI.OOXML.dll //css_reference NPOI.OpenXml4Net.dll //css_reference NPOI.OpenXmlFormats.dll //css_reference ICSharpCode.SharpZipLib.dll namespace CatalogLoaderVSScriptEditor { public class CustomScript : CustomScriptBase { public HtmlPageLoader HplProduct { get; set; } public HtmlPageLoader HplCatalog { get; set; } public HtmlPageLoader HplLinks { get; set; } private bool _imageNameSku; private string _hashCod; private string _domen; private Product _product; private Category _category; private TaskInfo _mti; string _domain = "https://www.walgreens.com"; private GrabProcessState _gps; public CustomScript() { } public CustomScript(Category mroot) { _category = mroot; } public CustomScript(Product product) { _product = product; } List<string> _upcList = new List<string>(); public override void Login(LoginScriptParameters p) { _gps = p.State as GrabProcessState; _mti = p.Process.m_ti; string inputFilePath = Path.Combine(UtilSmall.ApplicationDataDirectory, "InputFile.txt");//_gps.GrabberSettings.UserParameterGet("InputFile"); FromFileRead(inputFilePath); //var client = new WebClientWithCookies(); //const string loginUrl = @""; //client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); //client.Headers.Add("Accept-Language", "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3"); //client.Headers.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0"); //client.Headers.Add("Content-Type", "application/x-www-form-urlencoded"); //client.Headers.Add("Host", new Uri(loginUrl).Host); 
//client.Headers.Add("Referer", loginUrl); //client.Encoding = Encoding.UTF8; //const string data = @""; //var str = client.UploadString(loginUrl, "POST", data); //if (str.Contains("logout")) //{ // p.SessionCookieCollection = client.SessionCookieContainer.GetCookies(new Uri(loginUrl)); //} //else //{ // throw new Exception("was not logged in ..."); //} } public void FromFileRead(string filePath) { _upcList = File.ReadAllLines(filePath).ToList(); } public override void GrabCatalogBuild(GrabCatalogBuildScriptParameters p) { //Init(p); _mti = p.Process.m_ti; _category = new Category(); var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty(); hpl.Load(_domain); var script = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(),'__HEADER_INITIAL_STATE__')]", "", false); var regex = new Regex(@"{[\w\W]+}};"); script = regex.Match(script).Value.TrimEnd(';'); var jobjectScript = JObject.Parse(script); if(jobjectScript.SelectToken("$.header.headNavData.menu-shop-products.categories") != null) { foreach(var cat in jobjectScript.SelectToken("$.header.headNavData.menu-shop-products.categories")) { var category = new Category(); category.Name = cat.SelectToken("$.name").ToString(); category.SourceUrl = _domain + cat.SelectToken("$.url").ToString(); _category.AddCategory(category); if(cat.SelectToken("$.categories") != null) { SubCategoryAdd(cat.SelectToken("$.categories"), category); } } } /* List<string> Link = new List<string>(); List<string> Name = new List<string>(); var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty(); var CurrentUrl = _domain; hpl.Load(CurrentUrl); TextUtils.GetHtmlValue(hpl.HtmlDoc, "//div[@class=\"menu\"]//li[position() >1 and position() < 7]/a[@class=\"level-top\"]/span", "", false, 0, true, out Name); TextUtils.GetHtmlValue(hpl.HtmlDoc, "//div[@class=\"menu\"]//li[position() >1 and position() < 7]/a[@class=\"level-top\"]", "href", true, 0, true, out Link); var category = new Category(); if (Link.Count != 0 && Link.Count == Name.Count) { for (int i = 0; i < 
Link.Count; i++) { category.Name = Name[i]; category.SourceUrl = Link[i]; SubCategoryAdd(category, hpl, ""); _category.AddCategory(category); } }*/ p.Root = _category; } public void SubCategoryAdd(JToken script, Category parent) { foreach(var cat in script) { var category = new Category(); category.Name = cat.SelectToken("$.name").ToString(); category.SourceUrl = _domain + cat.SelectToken("$.url").ToString(); parent.AddCategory(category); if(cat.SelectToken("$.categories") != null) { SubCategoryAdd(cat.SelectToken("$.categories"), category); } } } public override void ProcessFinished(ProcessFinishedScriptParameters p) { } public override void RunCategory(RunCategoryScriptParameters p) { } public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p) { // Init(p); // _mti = new TaskInfo(); //https://www.walgreens.com/search/results.jsp?Ntt=300411848 var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty(); foreach (var upc in _upcList) { _mti.AddLogInfo("Upc:" + upc); hpl.Load("https://www.walgreens.com/search/results.jsp?Ntt=" + upc); //product-container var productLink = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//ul[@class='product-container']/li[@class='item card card__product']//a", "href", true); p.Category.ProductLinks.Add(_domain + productLink); _mti.AddLogInfo("product link count:" + p.Category.ProductLinks.Count()); } //var CurrentUrl = p.Category.SourceUrl; // hpl.Load(CurrentUrl); // /*if (hpl.Content.Contains("")) // { // hpl.Load(CurrentUrl); // } // */ // var regex = new Regex(@"[0-9]{3,}"); // var N = regex.Match(p.Category.SourceUrl).Value; // var productCount = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//p[@id='resultcount']/strong", "", false); // if(string.IsNullOrEmpty(productCount)) // { // productCount = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//div[@id='resultcount']/strong", "", false); // } // while (true) // { // if(!_mti.CanContinue()) // { // break; // } // var script = TextUtils.GetHtmlValue(hpl.HtmlDoc, 
"//script[contains(text(), 'window.__APP_INITIAL_STATE__')]", "", false).Replace("window.__APP_INITIAL_STATE__ = ", "").TrimEnd(';'); // if(string.IsNullOrEmpty(script)) // { // script = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(), 'window.getInitialState=function')]", "", false).Replace("window.getInitialState=function(){ return ", "").TrimEnd(';'); // } // script = script.TrimEnd('}') + "}}"; // var jObjectScript = JObject.Parse(script); // if (jObjectScript.SelectTokens("$.searchResult.productList") != null || jObjectScript.SelectTokens("$.searchResult.productList").Count() != 0) // { // foreach (var productList in jObjectScript.SelectToken("$.searchResult.productList")) // { // p.Category.ProductLinks.Add(_domain + productList.SelectToken("$.productInfo.productURL").ToString()); // if (!_gps.GrabberSettings.Settings.IsMaxProductsInCategoryNull() && _gps.GrabberSettings.Settings.MaxProductsInCategory > 0 && p.Category.ProductLinks.Count >= _gps.GrabberSettings.Settings.MaxProductsInCategory) // { // goto OUT_OF_FUNC; // } // } // } // else // { // break; // } // if (Convert.ToInt32(productCount) > 80) // { // CurrentUrl = "https://www.walgreens.com/store/store/category/productlist.jsp?webExc=true&N=" + N + "&No=" + p.Category.ProductLinks.Count(); // if (!hpl.Load(CurrentUrl)) // { // break; // } // } // else // { // break; // } // if (p.Category.ProductLinks.Count() >= Convert.ToInt32(productCount)) // { // break; // } // } //OUT_OF_FUNC: // var k = ""; } public override void RunProduct(RunProductScriptParameters p) { p.Product.BarCode = p.Product.Url.Split(new string[] { "Ntt=" }, StringSplitOptions.None).Last(); string categoryPath = p.Product.GetAttributeValue("CATEGORY_PATH"); if(!string.IsNullOrEmpty(categoryPath)) { var hDoc = new HtmlDocument(); hDoc.LoadHtml(categoryPath); var node = hDoc.DocumentNode.SelectNodes("./li"); List<string> categoryLinks = new List<string>(); List<string> categoryName = new List<string>(); foreach(var n in 
node) { categoryLinks.Add("https://#"); categoryName.Add(n.SelectSingleNode("./a").InnerHtml); } var c = new Category(); var cLast = CategoriesAdd(0, categoryName, categoryLinks, c); p.Product.Category = cLast; } var k = ""; //string imagesHtml = p.Product.GetAttributeValue("IMAGES_HTML"); //var script = p.Product.GetAttributeValue("Script"); // string productData = p.Product.GetAttributeValue("PRODUCT_DATA_HTML"); // string script = p.Product.GetAttributeValue("SCRIPT_HTML"); // var size = p.Product.GetAttributeValue("SIZE_HTML"); // string inStock = p.Product.GetAttributeValue("IN_STOCK"); // string productFullSize = p.Product.GetAttributeValue("PRODUCT_FULL_SIZE"); // var art = p.Product.Art; // if(art.Contains("react-text")) // { // var regex = new Regex(@"[0-9]{3,}"); // art = regex.Match(art).Value; // p.Product.Art = art; // } ///* var price = p.Product.Price; // if(price.Contains("<sup>")) // { // var regex = new Regex(@"[0-9]+"); // var pr = regex.Matches(price); // p.Product.Price = pr[0].ToString() + "." + pr[1].ToString(); // }*/ // var weight = p.Product.Weight; // if(weight.Contains("react-text")) // { // var regex = new Regex("-->"+@"[0-9]+\W*[0-9]*"); // var NewWeight = regex.Match(weight).Value.Replace("-->",""); // p.Product.Weight = NewWeight; // } //// art = art. 
//if(!string.IsNullOrEmpty(productFullSize)) // { // p.Product.AddDynamicAttribute("Size of the package", productFullSize); // } // var name = p.Product.Name; // var hDoc = new HtmlDocument(); ///* if(!string.IsNullOrEmpty(imagesHtml)) // { // hDoc.LoadHtml(imagesHtml); // ImageClp imageClp; // List<ImageClp> images = new List<ImageClp>(); // List<string> LinksToImages; // TextUtils.GetHtmlValue(hDoc, "//img", "src", true, 0, true, out LinksToImages); // foreach (var link in LinksToImages) // { // imageClp = new ImageClp(); // if(LinksToImages.Count == 1) // { // imageClp.Url = link; // } // else // { // imageClp.Url = "https:" + link.Replace("100", "900"); // } // images.Add(imageClp); // } // p.Product.ImageSafeAdd(images); // } // else//інколи не загружається фотограція через v4 пробую по іншому. // { // var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty(); // var CurrentUrl = p.Product.Url; // hpl.Load(CurrentUrl); // ImageClp imageClp; // List<ImageClp> images = new List<ImageClp>(); // List<string> LinksToImages = new List<string>(); // TextUtils.GetHtmlValue(hpl.HtmlDoc, "//ul[@id='thumbnailImages']//img", "src", true, 0, true, out LinksToImages); // if(LinksToImages.Count == 0) // { // TextUtils.GetHtmlValue(hpl.HtmlDoc, "//div[@id='zoomLensContainer']//img", "src", true, 0, true, out LinksToImages); // } // foreach (var link in LinksToImages) // { // imageClp = new ImageClp(); // if (LinksToImages.Count == 1) // { // imageClp.Url = link; // } // else // { // imageClp.Url = "https:" + link.Replace("100", "900"); // } // images.Add(imageClp); // } // p.Product.ImageSafeAdd(images); // }*/ // if(!string.IsNullOrEmpty(inStock)) // { // p.Product.Quantity = "1"; // } // else // { // p.Product.Quantity = "15"; // } // if(!string.IsNullOrEmpty(size)) // { // if (size.Contains("react-text")) // { // var regex = new Regex(@"-->\d+[\w\W]+"); // size = regex.Match(size).Value.Replace("-->", ""); // var sizes1 = size.Split('x'); // p.Product.Width = sizes1[1]; // 
p.Product.Height = sizes1[2]; // p.Product.Depth = sizes1[0]; // } // else // { // var sizes = size.Split('x'); // p.Product.Width = sizes[1]; // p.Product.Height = sizes[2]; // p.Product.Depth = sizes[0]; // } // } // if (!string.IsNullOrEmpty(script)) // { // /*var regex = new Regex(@"{[\w\W]+}}};"); // script = regex.Match(script).Value.TrimEnd(';'); // regex = new Regex("%3" + @"[\w\W]+%20"); // var description = regex.Match(script).Value; // var jObjectScript = JObject.Parse(script);*/ // /*if (jObjectScript.SelectToken("$.product.results.prodDetails.section.[0].description.productDesc") != null) // { // var Descriptions = jObjectScript.SelectToken("$.product.results.prodDetails.section.[0].description.productDesc").ToString(); // Descriptions = System.Uri.UnescapeDataString(Descriptions); // p.Product.FullDescription = Descriptions;*/ // var regex = new Regex("\"productDesc\":\"" + @"[\w\W]+" + "\",\"quickView\""); // string Descriptions = regex.Match(script).Value; // if(!string.IsNullOrEmpty(Descriptions)) // { // script = script.Replace(Descriptions.Replace("\"quickView\"", ""), ""); // } // //regex = new Regex(",{\"warnings\"" + @"[\w\W]+" + "\"}}"); // //var tmp = regex.Match(script).Value; // regex = new Regex(@"{[\w\W]+}}};"); // script = regex.Match(script).Value.TrimEnd(';'); // regex = new Regex("\"product\"" + @"[\w\W]+" + ",\"shippingOverlay\""); // script = "{" + regex.Match(script).Value.Replace(",\"shippingOverlay\"", "") + "}"; // regex = new Regex(",{\"warnings\"" + @"[\w\W]+" + ",{\"shipping\""); // var tmp = regex.Match(script).Value.Replace(",{\"shipping\"", ""); // if (!string.IsNullOrEmpty(tmp)) // { // script = script.Replace(tmp, ""); // } // regex = new Regex("\"description\":{\"" + @"[\w\W]+" + "}]},\"productInfo\""); // tmp = regex.Match(script).Value.Replace("}]},\"productInfo\"", ""); // if (!string.IsNullOrEmpty(tmp)) // { // script = script.Replace(tmp, ""); // } // var jObjectScript = JObject.Parse(script); // if 
(jObjectScript.SelectToken("$.product.results.priceInfo.salePrice") != null) // { // p.Product.Price = jObjectScript.SelectToken("$.product.results.priceInfo.salePrice").ToString().Replace("$", ""); // p.Product.PriceOld = jObjectScript.SelectToken("$.product.results.priceInfo.regularPrice").ToString().Replace("$",""); // } // else // { // p.Product.Price = jObjectScript.SelectToken("$.product.results.priceInfo.regularPrice").ToString().Replace("$",""); // } // if(jObjectScript.SelectToken("$.product.results.productInfo.filmStripUrl") != null) // { // int counter = 1; // foreach(var images in jObjectScript.SelectToken("$.product.results.productInfo.filmStripUrl")) // { // if(images.SelectToken("$.zoomImageUrl" + counter)!= null) // { // p.Product.ImageAdd("https:" +images.SelectToken("$.zoomImageUrl" + counter).ToString()); // } // else if(images.SelectToken("$.largeImageUrl" + counter) != null) // { // p.Product.ImageAdd("https:" + images.SelectToken("$.largeImageUrl" + counter).ToString()); // } // counter++; // } // } // var combinationJObject = jObjectScript.SelectToken("$.product.results.inventory.relatedProducts"); // List<Combination> ProductCombination = new List<Combination>(); // if (combinationJObject.Count() > 0) // { // foreach(var com in combinationJObject) // { // JProperty jProperty = com.ToObject<JProperty>(); // foreach (var comData in com) // { // foreach(var data in comData ) // { // var combination = new Combination(); // var k = data.SelectToken("$.value").ToString(); // if (data.SelectToken("$.value") != null) // { // combination.AddDynamicAttribute(jProperty.Name.ToString(), data.SelectToken("$.value").ToString()); // } // combination.Price = data.SelectToken("$.priceInfo.regularPrice").ToString().TrimStart('$'); // if (data.SelectToken("$.isavlbl").ToString() == "yes") // { // combination.Quantity = "10"; // } // else // { // combination.Quantity = "1"; // } // if (data.SelectToken("$.key") != null) // { // combination.Art = 
data.SelectToken("$.key").ToString().Replace("sku", ""); // } // ProductCombination.Add(combination); // } // } // } // } // p.Product.CombinationsAdd(ProductCombination); // if (!string.IsNullOrEmpty(Descriptions)) // { // Descriptions = Descriptions.Replace("\"productDesc\":\"","").Replace("\",\"quickView\"", ""); // } // Descriptions = System.Uri.UnescapeDataString(Descriptions); // p.Product.FullDescription = Descriptions; // if (!string.IsNullOrEmpty(productData)) // { // hDoc.LoadHtml(productData); // var text = TextUtils.GetHtmlValue(hDoc, "//li[@id='Ingredients']", "", false); // if(!string.IsNullOrEmpty(text)) // { // p.Product.FullDescription += text; // } // text = TextUtils.GetHtmlValue(hDoc, "//li[@id='Warnings']", "", false); // if(!string.IsNullOrEmpty(text)) // { // p.Product.FullDescription +=text; // } // } // //} // } } public Category CategoriesAdd(int index, List<string> CategoriesNames, List<string> CategoriesLinks, Category parent) { //var last = new Category(); var category = new Category(); if (index < CategoriesLinks.Count()) { category.Name = CategoriesNames[index]; category.SourceUrl = CategoriesLinks[index]; parent.AddCategory(category); index++; var last = CategoriesAdd(index, CategoriesNames, CategoriesLinks, category); return last; } else { return parent; } } #region Settings private void Init(RunProductScriptParameters p) { _mti = p.Process.m_ti; _product = p.Product; _category = p.Category; if (HplProduct != null) return; var hpl = p.Process as OneProductLoader; if (string.IsNullOrEmpty(_domen)) _domen = hpl.State.GrabberSettings.Settings.ShopUrl; if (hpl != null) HplProduct = Helper.Hpl = hpl.State.Proxy.GetHtmlPageLoader(_domen); } private void Init(GetProductLinksForCategoryScriptParameters p) { _mti = p.Process.m_ti; _category = p.Category; if (HplLinks != null) return; var hpl = p.Process as OneCategoryLoader; if (string.IsNullOrEmpty(_domen)) _domen = hpl.State.GrabberSettings.Settings.ShopUrl; if (hpl != null) HplLinks = 
hpl.State.Proxy.GetHtmlPageLoader(_domen); } private void Init(GrabCatalogBuildScriptParameters p) { _mti = p.Process.m_ti; _category = new Category { ID = "0" }; if (HplCatalog != null) return; var hpl = p.Process as GrabCatalogFromWeb; if (string.IsNullOrEmpty(_domen)) _domen = hpl.State.GrabberSettings.Settings.ShopUrl; if (hpl != null) HplCatalog = hpl.State.Proxy.GetHtmlPageLoader(_domen); } private void StartProduct(Product product) { product.Name = HttpUtility.HtmlDecode(product.Name); _hashCod = Helper.HashCod = Helper.GetHashCodeString(product.Name + product.Url); if (!string.IsNullOrEmpty(product.FullDescription)) { product.FullDescription = product.FullDescription.RemoveTagAttr("id[next]class"); product.FullDescription = Regex.Replace(product.FullDescription, @"\r|\n|\t|\s{2,}", ""); } // удаляем аттрибуты тегов в кратком описании if (!string.IsNullOrEmpty(product.SmallDescription)) { product.SmallDescription = product.SmallDescription.RemoveTagAttr("id[next]class"); product.SmallDescription = Regex.Replace(product.SmallDescription, @"\r|\n|\t|\s{2,}", ""); } if (string.IsNullOrEmpty(product.ID)) product.ID = _hashCod; if (string.IsNullOrEmpty(product.GetAttributeValue("SYS_PRODUCT_ART"))) product.SetAttributeValue("SYS_PRODUCT_ART", product.ID); product.ImageAdd(string.IsNullOrEmpty(product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE")) ? 
product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE_SMALL") : product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE")); if (_imageNameSku) product.Image = Helper.MakeImgName2Sku(product.ImageFull, 0); product.Price = product.Price.GetTruePrice(); if (!string.IsNullOrEmpty(product.UrlRewrite)) product.UrlRewrite = Helper.UrlRewrite(product.UrlRewrite, "-"); } private void FinishProduct(Product product) { product.HtmlBlocksClean(); product.Price = product.Price.GetTruePrice(); // удаляем аттрибуты тегов в полном описании if (!string.IsNullOrEmpty(product.FullDescription)) product.FullDescription = product.FullDescription.RemoveTagAttr("id[next]class"); // удаляем аттрибуты тегов в кратком описании if (!string.IsNullOrEmpty(product.SmallDescription)) product.SmallDescription = product.SmallDescription.RemoveTagAttr("id[next]class"); if (!product.PriceIsOk) { _mti.AddLogError("Price: '" + product.Price + "' does not exist or not configured and will be set 0. Link product: \" " + product.Url + " \""); product.Price = "0"; } /*if (product.ImagesCount > 0) { product.SetAttributeValue("SYS_IMAGE_LINK_ALL", Helper.GetAllImageLinksAsString(product.ImagesGet().Keys.ToList<string>(), ",")); product.SetAttributeValue("SYS_IMAGE_NAME_ALL", Helper.GetAllImageNamesAsString(product.ImagesGet().Values.ToList<string>(), ",", "").Trim()); }*/ } #endregion } #region class Helper public static class Helper { public static string HashCod = ""; public static HtmlPageLoader Hpl; public static Dictionary<string, string> TranslatePairs { get; set; } public static string GetTruePrice(this string str) { str = str.Replace(".", ","); str = Regex.Replace(str, @"\s*", ""); return str; } public static string Translate(this String str, bool wordTranslate = false) { if (!wordTranslate) { foreach (var word in str.Split(new[] { ',', '.', '!', '?', ';', ':', '(', ')', '"', '[', ']', '{', '}' }).Where(word => !string.IsNullOrEmpty(word))) str = str.Replace(word, GetTranslate(word)); } // потом перебираю слова 
var text = str.Split(new[] { ' ' }).Aggregate(string.Empty, (current, word) => current + (" " + GetTranslate(word))); return text.Trim(); } private static string GetTranslate(string str) { if (TranslatePairs == null) TranslatePairs = new Dictionary<string, string>(); string trWord; return TranslatePairs.TryGetValue(str.ToLowerInvariant().Trim(), out trWord) ? trWord : str.Trim(); } public static List<string> GetListFromBlock(string input, string xpath, string attr) { List<string> list; Hpl.SetContent(input); TextUtils.ExtractValuesByXpath(xpath, "", Hpl.HtmlDoc, true, out list); return list; } /// <summary> /// Используем стоп-слово для строки /// </summary> public static string StopWord(this String str, string stop) { var index = str.IndexOf(stop, StringComparison.Ordinal); return index <= -1 ? str : str.Substring(0, index); } /// <summary> /// Используем старт-слово для строки /// </summary> public static string StartWord(this String str, string stop) { var index = str.IndexOf(stop, StringComparison.Ordinal); return index <= -1 ? str : str.Substring(index + stop.Length); } /// <summary> /// проверка ссылки и доработка /// </summary> /// <param name="str"></param> /// <param name="domen"></param> /// <returns></returns> public static string TrueLink(this String str, string domen="") { if (str.StartsWith("http")) return str; if (Regex.IsMatch(str, @"^\/{2}")) str = "http:" + str; else { str = Regex.Replace(str, @"^\/", ""); domen = Regex.Replace(domen, @"\/$", ""); str = domen + "/" + str; } return str; } /// <summary> /// Используем стоп-слово для строки /// </summary> public static string LastStopWord(this String str, string stop) { var index = str.LastIndexOf(stop, StringComparison.Ordinal); return index <= -1 ? 
str : str.Substring(0, index); } /// <summary> /// Используем последнее вхождение старт-слово для строки /// </summary> public static string LastStartWord(this String str, string stop) { var index = str.LastIndexOf(stop, StringComparison.Ordinal); return index <= -1 ? str : str.Substring(index + stop.Length); } /// <summary> /// Используем стоп-слово для строки /// </summary> public static string RemoveTagAttr(this String str, string replaceItems) { return SplitToNextMark(replaceItems).Aggregate(str, (current, item) => Regex.Replace(current, item + @"=(""[^""]*""|'[^']*')", "")); } /// <summary> /// Получаем домен из Product или Category /// </summary> /// <returns></returns> public static string GetDomen(Category category) { return "http://" + new Uri(category.SourceUrl).Host; } public static string GetDomen(Product product) { return "http://" + new Uri(product.Url).Host; } /// <summary> /// возвращает хешкод полученых данных,исключает отрицательное число /// сохраняет внутри класса переменную HashCod /// </summary> public static string GetHashCodeString(string item) { var hash = item.GetHashCode(); if (hash < 0) hash = hash * -1; return hash.ToString(CultureInfo.InvariantCulture); } /// <summary> /// заменяет имя изображения на SKU продукта /// </summary> /// <param name="imgHref">имя изображения(надо для получения рассширения изображения)</param> /// <param name="idx">порядовый номер изображения</param> /// <returns></returns> public static string MakeImgName2Sku(string imgHref, int idx) { var expanImg = new Regex(@"\.(jpg|png|bmp|gif|jpeg)", RegexOptions.IgnoreCase).Match(imgHref).Value;//list must add if (string.IsNullOrEmpty(expanImg)) expanImg = ".jpg"; if (idx > 1) { return HashCod + "-" + idx.ToString(CultureInfo.InvariantCulture) + expanImg; } return HashCod + expanImg; } /// <summary> /// все ссылки на фото в одной строке через разделитель /// </summary> /// <param name="listImages"></param> /// <param name="delimeter">string delimeter</param> /// 
<returns>string allImages</returns> public static string GetAllImageLinksAsString(List<string> listImages, string delimeter) { string links = listImages.Aggregate(string.Empty, (current, img) => current + (delimeter + img)); return links.Substring(delimeter.Length); } /// <summary> /// все имена фото в одной строке через разделитель /// </summary> /// <param name="listImages"></param> /// <param name="delimeter"></param> /// <param name="prefix"></param> /// <returns></returns> public static string GetAllImageNamesAsString(List<string> listImages, string delimeter, string prefix) { string names = listImages.Aggregate(string.Empty, (current, img) => current + (delimeter + prefix + img)); return names.Substring(delimeter.Length); } /// <summary> /// Делим строку на части по заданому сепаратору /// </summary> /// <param name="input">строка ввода</param> /// <param name="separator"> по умолчанию [next]</param> /// <returns> коллекция List с результатами </returns> public static List<string> SplitToNextMark(string input, string separator = "[next]") { var parts = input.Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries).ToList(); return parts; } /// <summary> /// Меняем разделители в стандартном urlrewrite /// </summary> /// <param name="str"></param> /// <param name="delim"></param> /// <returns></returns> public static string UrlRewrite(string str, string delim) { str = new Regex(@"_+").Replace(AttributableItem.UrlRewriteGet(str), delim); str = new Regex(string.Format("^{0}|{0}$", delim)).Replace(str, ""); return str; } /// <summary> /// Делим строку на части по заданому сепаратору /// </summary> /// <param name="xpath"></param> /// <param name="separator">по умолчанию [attr]</param> /// <returns>Одна пара с результатами</returns> public static KeyValuePair<string, string> GetPairsXpath2Attr(string xpath, string separator = "[attr]") { var pair = xpath.Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries); return new KeyValuePair<string, 
string>(pair[0], (pair.Length > 1) ? pair[1] : string.Empty); } /// <summary> /// Получаем имя из ссылки /// </summary> /// <param name="href"></param> /// <returns></returns> public static string CreateNameFromHref(string href) { var name = Regex.Replace(href, @"^[\w\W]*/", string.Empty); return name; } /// <summary> /// доработаная обертка TextUtils.ExtractValuesByXpath с использованием [next] и [attr] /// </summary> /// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument</param> /// <param name="xpath">"xpath name[attr]xpath attribute [next] path name[attr]xpath attribute"</param> /// <returns></returns> public static List<string> ExtractValuesByXpath(HtmlDocument htmlDoc, string xpath) { foreach (var pair in SplitToNextMark(xpath).Select(part => GetPairsXpath2Attr(part))) { List<string> results; TextUtils.ExtractValuesByXpath(pair.Key, pair.Value, htmlDoc, true, out results); if (results != null && results.Count > 0) return results; } return new List<string>(0); } /// <summary> /// доработаная обертка TextUtils.ExtractFirstValuesByXpath с использованием [next] и [attr] /// </summary> /// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument</param> /// <param name="xpath">"xpath name[attr]xpath attribute [next] path name[attr]xpath attribute"</param> /// <returns>string</returns> public static string ExtractFirstValueByXpath(HtmlDocument htmlDoc, string xpath) { foreach (var pair in SplitToNextMark(xpath).Select(part => GetPairsXpath2Attr(part))) { var result = TextUtils.ExtractFirstValueByXpath(pair.Key, pair.Value, htmlDoc); if (!string.IsNullOrEmpty(result)) return result; } return string.Empty; } public static void HtmlRestore(this HtmlPageLoader hpl) { var restoreHtml = new RestoreHtml(hpl); hpl.SetContent(restoreHtml.ProcessingLiTag(hpl.Content)); } class RestoreHtml { private readonly HtmlPageLoader _hpl; public RestoreHtml(HtmlPageLoader hpl) { _hpl = hpl; } public string ProcessingLiTag(string temp) { var text = string.Empty; _hpl.SetContent(temp); foreach 
(var item in ExtractValuesByXpath(_hpl.HtmlDoc, "/li"))
                {
                    temp = item;
                    while (true)
                    {
                        _hpl.SetContent(temp);
                        var innerData = ExtractFirstValueByXpath(_hpl.HtmlDoc, "/li");
                        if (!string.IsNullOrEmpty(innerData))
                        {
                            var ind = temp.IndexOf(innerData, StringComparison.Ordinal);
                            if (ind <= -1) break;
                            var res = temp.Substring(0, ind - 4);
                            text += "<li>" + res + "</li>";
                            temp = innerData;
                        }
                        else
                        {
                            text += "<li>" + temp + "</li>";
                            break;
                        }
                    }
                }
                return "<ul>" + text + "</ul>";
            }
        }
    }
    #endregion

    #region class ParametrsProccessing
    /// <summary>
    /// Extracts name/value pairs from an HTML fragment and stores them on a product
    /// as dynamic attributes.
    /// </summary>
    class ParametrsProcessing : CustomScript
    {
        private readonly HtmlPageLoader _hpl;
        private readonly Product _product;
        private bool _translate;
        private readonly string _htmlParameters;

        public CmsEngine Cms { get; set; }

        /// <summary>Attribute names that must not be stored on the product.</summary>
        public List<string> NotTakedName { get; set; }

        public ParametrsProcessing(Product product, HtmlPageLoader hpl, string htmlParameters, bool translate = false)
            : base(product)
        {
            _product = product;
            _hpl = hpl;
            _htmlParameters = htmlParameters;
            _translate = translate;
        }

        /// <summary>
        /// Collects attributes using xpath expressions.
        /// </summary>
        /// <param name="split">xpath that selects one block per attribute</param>
        /// <param name="name">xpath of the attribute name inside a block</param>
        /// <param name="value">xpath of the attribute value inside a block</param>
        public void GetDinamicAttributesXpath(string split, string name, string value)
        {
            _hpl.SetContent(_htmlParameters);

            List<string> rows;
            TextUtils.ExtractValuesByXpath(split, "", _hpl.HtmlDoc, true, out rows);

            foreach (var row in rows)
            {
                // Each block is re-loaded with line breaks and tabs removed before the inner xpaths run.
                var flattened = row.Replace("\r", "").Replace("\t", "").Replace("\n", "");
                _hpl.SetContent(flattened);

                var attributeName = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, name);
                var attributeValue = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, value);

                if (string.IsNullOrWhiteSpace(attributeName) || string.IsNullOrWhiteSpace(attributeValue))
                    continue;

                AddAttibute(attributeName, attributeValue);
            }
        }

        /// <summary>
        /// Collects attributes using regular expressions.
        /// </summary>
        /// <param name="split">literal string the content is split on (one chunk per attribute)</param>
        /// <param name="name">regex whose first group captures the attribute name</param>
        /// <param name="value">regex whose first group captures the attribute value</param>
        public void GetDinamicAttributesRegex(string split, string name, string value)
        {
            _hpl.SetContent(_htmlParameters);

            var separators = new[] { split };
            foreach (var chunk in _hpl.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                var flattened = chunk.Replace("\r", "").Replace("\t", "").Replace("\n", "");
                var columnName = Regex.Match(flattened, name).Groups[1].Value;
                var attributeValue = Regex.Match(flattened, value).Groups[1].Value;
                AddAttibute(columnName, attributeValue);
            }
        }

        /// <summary>
        /// Strips markup and reserved characters from the pair and stores it on the product,
        /// unless the name is excluded or either side ends up blank.
        /// </summary>
        private void AddAttibute(string columnName, string attrValue)
        {
            var tagPattern = new Regex(@"<[^>]*>");

            columnName = tagPattern.Replace(columnName, "")
                .Replace(":", "")
                .Replace(";", ",")
                .Replace("\"", "''")
                .Trim();
            attrValue = tagPattern.Replace(attrValue, "")
                .Replace("\"", "''")
                .Replace(";", ",")
                .Trim();

            if (CheckNameProperties(columnName)) return;
            if (string.IsNullOrWhiteSpace(columnName) || string.IsNullOrWhiteSpace(attrValue)) return;

            _product.AddDynamicAttribute(columnName, attrValue);
        }

        /// <summary>
        /// Serializes all dynamic attributes as "name:value" pairs joined by commas and writes
        /// them to the attribute that corresponds to the selected <see cref="Cms"/>.
        /// </summary>
        // ReSharper disable once UnusedMember.Local
        private void AddParametrsFromProduct()
        {
            string targetAttribute;
            switch (Cms)
            {
                case CmsEngine.Prestashop:
                    targetAttribute = "SYS_PROPERTIES_PRESTA";
                    break;
                case CmsEngine.Advatshop:
                    targetAttribute = "SYS_PARAMETRS_ADVANTSHOP";
                    break;
                default:
                    return;
            }

            var serialized = string.Empty;
            foreach (var property in _product.GetDynamicAttributes())
            {
                serialized += string.Format(",{0}:{1}", property.Key, _product.GetAttributeValue(property.Value));
            }

            _product.SetAttributeValue(targetAttribute, serialized.Trim(','));
        }

        /// <summary>Returns true when <paramref name="name"/> is listed in <see cref="NotTakedName"/>.</summary>
        private bool CheckNameProperties(string name)
        {
            return NotTakedName != null && NotTakedName.Count >= 1 && NotTakedName.Contains(name);
        }

        /// <summary>
        /// Builds a table of values using xpath.
        /// </summary>
        /// <param name="xpathSplit">xpath of the block the content is split by (one row per match)</param>
        /// <param name="xpathData">xpaths of the columns written as one string, for example "//a[attr]href[next]//div[attr]class"</param>
        /// <returns>One list per matched block; each cell is the first match of a column xpath or an empty string.</returns>
        public List<List<string>> GetDataFromXpath(string xpathSplit, string xpathData)
        {
            _hpl.SetContent(_htmlParameters);

            List<string> blocks;
            TextUtils.ExtractValuesByXpath(xpathSplit, "", _hpl.HtmlDoc, true, out blocks);

            var table = new List<List<string>>();
            foreach (var block in blocks)
            {
                _hpl.SetContent(block);

                var row = new List<string>();
                foreach (var spec in GetArrayFromDelim("[next]", xpathData))
                {
                    // "xpath[attr]attribute": the attribute part is optional.
                    var pieces = GetArrayFromDelim("[attr]", spec);
                    var attribute = pieces.Length > 1 ? pieces[1] : string.Empty;

                    List<string> found;
                    TextUtils.ExtractValuesByXpath(pieces[0], attribute, _hpl.HtmlDoc, false, out found);
                    row.Add(found.Count > 0 ? found[0] : string.Empty);
                }

                table.Add(row);
            }

            return table;
        }

        /// <summary>Splits <paramref name="input"/> on the literal <paramref name="delim"/>, keeping empty entries.</summary>
        private string[] GetArrayFromDelim(string delim, string input)
        {
            var separators = new[] { delim };
            return input.Split(separators, StringSplitOptions.None);
        }
    }

    enum CmsEngine
    {
        Prestashop,
        Advatshop
    }
    #endregion

    #region class ImageProcessing
    public class ImageProcessing : CustomScript
    {
        private readonly string _imagesBlock;
        private readonly bool _imageNameSku;
        private readonly string _domen;
        private readonly HtmlPageLoader _hpl;
        private readonly Product _product;
        private int _idx;
        private readonly int _countImg;

        /// <summary>
        /// </summary>
        /// <param name="product"></param>
        /// <param name="domen"></param>
        /// <param name="imagesBlock">block with image code</param>
        /// <param name="hpl"></param>
        /// <param name="imageNameSku"></param>
        public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock)
            : this(product, hpl, imageNameSku, domen, imagesBlock, 0)
        {
        }

        public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock, int countImg)
            : base(product)
        {
            _product = product;
            _hpl = hpl;
            _domen = domen;
            _imagesBlock = imagesBlock;
            _imageNameSku =
imageNameSku;
            if (countImg > 0) _countImg = countImg;
            _idx = product.ImagesCount + 1;
        }

        /// <summary>
        /// Extracts image sources from the image block with regular expressions and adds them to the product.
        /// </summary>
        /// <param name="split">literal string the image block is split on</param>
        /// <param name="contains">substring a part must contain to be considered (empty string accepts every part)</param>
        /// <param name="reg">regex pattern(s) whose first group captures the image source</param>
        public void GetAdditionalImages(string split, string contains, string reg)
        {
            var sources = GetSourceImagesWithRegex(_imagesBlock, split, contains, reg);
            if (sources.Count == 0) return;
            AddImage(sources);
        }

        /// <summary>
        /// Extracts image sources from the image block with an xpath and adds them to the product.
        /// </summary>
        /// <param name="xpath">xpath that selects the image sources</param>
        public void GetAdditionalImages(string xpath)
        {
            _hpl.SetContent(_imagesBlock);
            var sources = Helper.ExtractValuesByXpath(_hpl.HtmlDoc, xpath);
            if (sources.Count == 0) return;
            AddImage(sources);
        }

        /// <summary>
        /// Splits <paramref name="blockAddImage"/> on <paramref name="split"/> and, for every part that passes the
        /// <paramref name="contains"/> filter, applies each pattern produced by Helper.SplitToNextMark(reg),
        /// collecting the trimmed first capture group when it is not blank.
        /// </summary>
        /// <returns>Captured image sources in part order, then pattern order.</returns>
        public static List<string> GetSourceImagesWithRegex(string blockAddImage, string split, string contains, string reg)
        {
            var parts = blockAddImage.Split(new[] { split }, StringSplitOptions.RemoveEmptyEntries);
            var sources = new List<string>();

            foreach (var part in parts)
            {
                // NOTE(review): a null filter rejects every part (the result is empty), while an
                // empty-string filter accepts every part - confirm this asymmetry is intended.
                if (contains == null) continue;
                if (!string.IsNullOrEmpty(contains) && !part.Contains(contains)) continue;

                foreach (var pattern in Helper.SplitToNextMark(reg))
                {
                    var captured = new Regex(pattern).Match(part).Groups[1].ToString().Trim();
                    if (!string.IsNullOrWhiteSpace(captured)) sources.Add(captured);
                }
            }

            return sources;
        }

        /// <summary>
        /// Adds the given sources to the product as images, skipping blanks and the product's main image.
        /// When a positive image limit was configured the list is truncated to that limit first.
        /// </summary>
        private void AddImage(List<string> listSource)
        {
            var overLimit = _countImg > 0 && listSource.Count > _countImg;
            if (overLimit) RemoveItemToList(ref listSource);

            foreach (var source in listSource)
            {
                if (string.IsNullOrWhiteSpace(source)) continue;

                var absoluteLink = source.TrueLink(_domen);
                if (absoluteLink == _product.ImageFull) continue;

                // The counter is reset to 1 whenever the product has no main image.
                if (string.IsNullOrEmpty(_product.ImageFull)) _idx = 1;

                if (!_imageNameSku)
                {
                    _product.ImageAdd(absoluteLink);
                    continue;
                }

                _product.ImageAdd(absoluteLink, GetNameImageFromHref(source, _idx));
                _idx++;
            }
        }

        /// <summary>Removes every element past the configured image limit.</summary>
        private void RemoveItemToList(ref List<string> listSource)
        {
            var excess = listSource.Count - _countImg;
            listSource.RemoveRange(_countImg, excess);
        }

        /// <summary>
        /// Builds the image name by delegating to Helper.MakeImgName2Sku.
        /// </summary>
        /// <param name="imgHref">path to image</param>
        /// <param name="idx">image index passed through to the helper</param>
        /// <returns>The name produced by Helper.MakeImgName2Sku.</returns>
        public string GetNameImageFromHref(string imgHref, int idx)
        {
            var imageName = Helper.MakeImgName2Sku(imgHref, idx);
            return imageName;
        }
    }
    #endregion

    #region class WorksWithFiles
    class WorksWithFiles
    {
        public
string PathToDir { get; private set; }
        public string PathToFile { get; private set; }
        public string NameFile { get; private set; }
        public string ExtenFile { get; private set; }

        private readonly string[] _lines;
        private readonly bool _notFile;

        public WorksWithFiles(string file) : this(file, null) { }

        /// <summary>
        /// Remembers the file location and, for non-Excel files, reads all lines into memory.
        /// </summary>
        /// <param name="file">path to the file</param>
        /// <param name="encoding">encoding used to read the file; null uses the File.ReadAllLines default</param>
        public WorksWithFiles(string file, Encoding encoding)
        {
            GetValueFileVariables(file);

            // Never leave _lines null: the line-based CreateCollection overloads iterate it
            // even when the instance wraps an Excel file.
            _lines = new string[0];

            // Excel files are not read here; CreateCollectionFromExcel opens them on demand.
            if (ExtenFile == ".xlsx" || ExtenFile == ".xls") return;

            if (!File.Exists(PathToFile))
            {
                _notFile = true;
                return;
            }

            _lines = encoding != null ? File.ReadAllLines(PathToFile, encoding) : File.ReadAllLines(PathToFile);
        }

        /// <summary>
        /// Appends a line to the image download file used by the old version of Catalogloader.
        /// Call it once per image; the image name is derived from the link.
        /// </summary>
        /// <param name="path">path to the file</param>
        /// <param name="imageLink">link to the image</param>
        public static void CreateImageDownloadFile(string path, string imageLink)
        {
            var text = string.Format("{0};{1}", imageLink, Helper.CreateNameFromHref(imageLink));
            CreateTextFile(path, text);
        }

        /// <summary>
        /// Appends one line to a text file (UTF-8), creating the file when needed.
        /// </summary>
        /// <param name="path">path to the file</param>
        /// <param name="text">text of the line</param>
        public static void CreateTextFile(string path, string text)
        {
            CreateTextFile(path, text, Encoding.UTF8);
        }

        /// <summary>
        /// Appends one line to a text file, creating the file when needed.
        /// A path that does not contain the application data directory is placed under it.
        /// </summary>
        /// <param name="path">path to the file</param>
        /// <param name="text">text of the line</param>
        /// <param name="encoding">file encoding; null means UTF-8</param>
        public static void CreateTextFile(string path, string text, Encoding encoding)
        {
            var startupPath = UtilSmall.ApplicationDataDirectory;
            path = path.Replace("/", "\\");
            if (!path.Contains(startupPath))
            {
                if (!path.StartsWith("\\", StringComparison.Ordinal)) path = "\\" + path;
                path = startupPath + path;
            }

            if (encoding == null) encoding = Encoding.UTF8;

            // FileMode.Append creates the file when it does not exist, so a separate
            // File.Exists / FileMode.Create branch is unnecessary.
            using (var writer = new StreamWriter(new FileStream(path, FileMode.Append), encoding))
            {
                writer.WriteLine(text);
            }
        }

        /// <summary>
        /// Writes the list items as one line separated by ';' (UTF-8).
        /// </summary>
        /// <param name="path">path to the file</param>
        /// <param name="texts">items to write</param>
        public static void CreateTextFile(string path, List<string> texts)
        {
            CreateTextFile(path, texts, Encoding.UTF8);
        }

        /// <summary>
        /// Writes the trimmed list items as one line separated by ';'.
        /// An empty list produces an empty line.
        /// </summary>
        /// <param name="path">path to the file</param>
        /// <param name="texts">items to write</param>
        /// <param name="encoding">file encoding</param>
        public static void CreateTextFile(string path, List<string> texts, Encoding encoding)
        {
            // string.Join handles the empty list; the previous ";a;b".Substring(1) approach
            // threw ArgumentOutOfRangeException when there were no items.
            var text = string.Join(";", texts.Select(t => t.Trim()));
            CreateTextFile(path, text, encoding);
        }

        /// <summary>
        /// Builds a dictionary from the first sheet of an xlsx file: column 0 is the key and
        /// column 1 the value, both lower-cased and trimmed.
        /// </summary>
        /// <returns>The dictionary; empty when the file is not an .xlsx file.</returns>
        public Dictionary<string, string> CreateCollectionFromExcel()
        {
            if (_notFile || ExtenFile != ".xlsx") return new Dictionary<string, string>(0);

            var dic = new Dictionary<string, string>();
            var sheet = GetParseXls();
            for (var i = 0; i <= sheet.LastRowNum; i++)
            {
                try
                {
                    var key = sheet.GetRow(i).GetCell(0).ToString().ToLowerInvariant().Trim();
                    var value = sheet.GetRow(i).GetCell(1).ToString().ToLowerInvariant().Trim();
                    dic[key] = value;
                }
                catch
                {
                    // Deliberate best-effort: reading stops at the first row that cannot be read
                    // (for example a missing row or cell).
                    break;
                }
            }

            return dic;
        }

        /// <summary>
        /// Builds a dictionary from a file with two columns. Keys and values are lower-cased,
        /// trimmed and stripped of double quotes.
        /// </summary>
        /// <param name="delimeter">column delimiter used in the file</param>
        /// <param name="reverse">true to use the second column as the key</param>
        /// <param name="valueString">false - pairs with an already known key are skipped; true - their values are appended to the existing value separated by ','</param>
        public Dictionary<string, string> CreateCollection(string delimeter, bool reverse, bool valueString = false)
        {
            if (_notFile) return new Dictionary<string, string>(0);

            var dic = new Dictionary<string, string>();
            foreach (var line in _lines)
            {
                var parts = line.Split(new[] { delimeter }, StringSplitOptions.RemoveEmptyEntries);
                if (parts.Length <= 1) continue;

                var first = parts[0].Replace("\"", string.Empty).ToLowerInvariant().Trim();
                var second = parts[1].Replace("\"", string.Empty).ToLowerInvariant().Trim();
                var key = reverse ? second : first;
                var value = reverse ? first : second;

                string existing;
                if (!dic.TryGetValue(key, out existing)) dic.Add(key, value);
                else if (valueString) dic[key] = existing + "," + value;
            }

            return dic;
        }

        /// <summary>
        /// Builds a dictionary whose key is one column of each line and whose value is the list
        /// of the remaining columns (blank or repeated cells are stored as empty strings).
        /// Lines whose key was already seen are skipped.
        /// </summary>
        /// <param name="delimeter">column delimiter used in the file</param>
        /// <param name="columnIndex">index of the column used as the dictionary key</param>
        /// <returns></returns>
        public Dictionary<string, List<string>> CreateCollection(string delimeter, int columnIndex = 0)
        {
            if (_notFile) return new Dictionary<string, List<string>>(0);

            var dic = new Dictionary<string, List<string>>();
            foreach (var line in _lines)
            {
                var parts = line.Split(new[] { delimeter }, StringSplitOptions.RemoveEmptyEntries);
                if (parts.Length < 1 || parts.Length - 1 < columnIndex) continue;
                if (string.IsNullOrEmpty(parts[columnIndex])) continue;

                var list = new List<string>();
                for (var i = 0; i < parts.Length; i++)
                {
                    if (i == columnIndex) continue;
                    var part = parts[i].Replace("\"", string.Empty).Trim();
                    list.Add(!string.IsNullOrWhiteSpace(part) && !list.Contains(part) ? part : string.Empty);
                }

                var name = parts[columnIndex].Replace("\"", string.Empty).Trim();
                if (!dic.ContainsKey(name)) dic.Add(name, list);
            }

            return dic;
        }

        /// <summary>
        /// Returns the distinct lines of the file (double quotes removed, trimmed) in file order.
        /// </summary>
        /// <returns></returns>
        public List<string> CreateCollection()
        {
            if (_notFile) return new List<string>(0);

            var list = new List<string>();
            // HashSet gives O(1) duplicate detection while the list preserves first-seen order.
            var seen = new HashSet<string>();
            foreach (var line in _lines)
            {
                var value = line.Replace("\"", string.Empty).Trim();
                if (seen.Add(value)) list.Add(value);
            }

            return list;
        }

        /// <summary>Splits the given path into full path, file name, directory and extension.</summary>
        private void GetValueFileVariables(string file)
        {
            PathToFile = file;
            NameFile = Path.GetFileName(file);
            PathToDir = Path.GetDirectoryName(file);
            ExtenFile = Path.GetExtension(file);
        }

        /// <summary>
        /// Downloads a file to <see cref="PathToFile"/>, creating the target directory when needed.
        /// </summary>
        /// <param name="linkFile">link to the file</param>
        public void DownloadFile(string linkFile)
        {
            CreateDirs();
            // WebClient is IDisposable; release it as soon as the download finishes.
            using (var wc = new WebClient())
            {
                wc.DownloadFile(linkFile, PathToFile);
            }
        }

        /// <summary>
        /// Creates the target directory when it does not exist yet.
        /// </summary>
        private void CreateDirs()
        {
            // A bare file name has no directory component; Directory.CreateDirectory would throw on it.
            if (string.IsNullOrEmpty(PathToDir)) return;
            if (!Directory.Exists(PathToDir)) Directory.CreateDirectory(PathToDir);
        }

        /// <summary>
        /// Opens the Excel file and returns its first sheet (.xlsx via XSSF, anything else via HSSF).
        /// </summary>
        /// <returns></returns>
        private NPOI.SS.UserModel.ISheet GetParseXls()
        {
            if (ExtenFile == ".xlsx")
            {
                NPOI.XSSF.UserModel.XSSFWorkbook workbook;
                using (var file = new FileStream(PathToFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    workbook = new NPOI.XSSF.UserModel.XSSFWorkbook(file);
                }
                return workbook.GetSheetAt(0);
            }
            else
            {
                NPOI.HSSF.UserModel.HSSFWorkbook workbook;
                using (var file = new FileStream(PathToFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    workbook = new NPOI.HSSF.UserModel.HSSFWorkbook(file);
                }
                return workbook.GetSheetAt(0);
            }
        }
    }
    #endregion

    #region CreateGrabCatalog
    public class CreateGrabCatalog : CustomScript
    {
        private readonly Category _mroot;
        private readonly HtmlPageLoader _hplCatalog;
        private readonly GrabCatalogBuildScriptParameters _p;
        private readonly string _domen;

        public
CreateGrabCatalog(Category mroot, HtmlPageLoader hplCatalog, GrabCatalogBuildScriptParameters p, string domen)
            : base(mroot)
        {
            _domen = domen;
            _mroot = mroot;
            _hplCatalog = hplCatalog;
            _p = p;
        }

        /// <summary>
        /// Adds categories to the grab catalog.
        /// </summary>
        /// <param name="lCat">categories to add</param>
        /// <param name="id">ID of the parent category</param>
        /// <returns></returns>
        public void AddCategories(List<Category> lCat, string id)
        {
            foreach (var c in lCat)
            {
                AddCategory(c, id);
            }
        }

        /// <summary>
        /// Adds one category under the parent with the given ID. A category whose url is "#"
        /// is not added and an error is logged. On success the url is normalized with TrueLink
        /// and the ID assigned by the catalog is copied back to <paramref name="c"/>.
        /// </summary>
        /// <param name="c">category to add</param>
        /// <param name="id">ID of the parent category</param>
        public void AddCategory(Category c, string id)
        {
            var root = _mroot.FindCategoryById(id);
            if (c.SourceUrl == "#")
            {
                _p.Process.m_ti.AddLogError("The Category didn't add : url category = " + c.SourceUrl);
                return;
            }

            c.SourceUrl = c.SourceUrl.TrueLink(_domen);
            var cat = root.AddCategory(c.Name, true);
            cat.SourceUrl = c.SourceUrl;
            c.ID = cat.ID;
            _p.Process.m_ti.AddLogInfo("Added category level " + cat.Level + " '" + cat.GetFullName() + "', url " + cat.SourceUrl);
        }

        /// <summary>
        /// Collects categories from a page where the category name is text() and the link is an attribute.
        /// </summary>
        /// <param name="xpathName">xpath[attr]attr </param>
        /// <param name="xpathLink">xpath[attr]attr </param>
        /// <param name="root">parent category; its SourceUrl is loaded when <paramref name="htmlPage"/> is null</param>
        /// <param name="htmlPage">html to parse instead of loading the parent's url</param>
        /// <returns>The categories found; empty when the name and link counts differ or nothing matched.</returns>
        public List<Category> CreateListCategory(string xpathName, string xpathLink, Category root, string htmlPage = null)
        {
            if (htmlPage == null) _hplCatalog.Load(root.SourceUrl);
            else _hplCatalog.SetContent(htmlPage);

            var lc = new List<Category>();
            var name = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
            var link = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);
            if (name.Count != link.Count || link.Count <= 0) return lc;

            // Names may contain markup; strip tags before storing.
            lc.AddRange(name.Select((t, i) => new Category { Name = Regex.Replace(t, @"<[^>]*>", "").Trim(), SourceUrl = link[i] }));
            return lc;
        }

        /// <summary>
        /// Takes categories from a single page.
        /// </summary>
        /// <param name="html">html of the page</param>
        /// <param name="xpathBlock">xpath of the html block stored as the dictionary value</param>
        /// <param name="xpathName">xpath of the category names</param>
        /// <param name="xpathLink">xpath of the category links</param>
        /// <param name="root"></param>
        /// <returns>Category to block-html map; the value is an empty string when no block exists for that position.</returns>
        public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlock, string xpathName, string xpathLink, Category root)
        {
            _hplCatalog.SetContent(html);
            var lc = new Dictionary<Category, string>();
            var name = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
            var link = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);
            var blockHtml = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlock);
            if (name.Count <= 0) return lc;

            // Links are only trusted when they pair one-to-one with the names.
            var linksAligned = link.Count == name.Count;
            for (var a = 0; a < name.Count; a++)
            {
                var cat = new Category { Name = name[a].Trim(), SourceUrl = linksAligned ? link[a] : _domen };
                // Guard the index: the block xpath may match fewer nodes than the name xpath,
                // which previously threw ArgumentOutOfRangeException.
                lc.Add(cat, a < blockHtml.Count ? blockHtml[a] : string.Empty);
            }

            return lc;
        }

        public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlockAll, string xpathBlock, string xpathName, string xpathLink, Category root)
        {
            _hplCatalog.SetContent(html);
            var lc = new Dictionary<Category, string>();
            var blocks = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlockAll);
            foreach (var block in blocks)
            {
                _hplCatalog.SetContent(block);
                var name = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathName);
                var link = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathLink);
                var blockHtml = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathBlock);
                if (string.IsNullOrEmpty(name)) continue;
                var cat = new Category { Name = name, SourceUrl = !string.IsNullOrEmpty(link) ? link : _domen };
                lc.Add(cat, !string.IsNullOrEmpty(blockHtml) ?
blockHtml : string.Empty);
            }
            return lc;
        }
    }
    #endregion

    #region HtmlData
    /// <summary>
    /// Snapshot of one HTML node: decoded inner text/html, the common attributes as
    /// properties, and every attribute by name in <see cref="Attr2Value"/>.
    /// </summary>
    public class HtmlData
    {
        public string Text { get; set; }
        public string Value { get; set; }
        public string Title { get; set; }
        public string Alt { get; set; }
        public string Style { get; set; }
        public string Name { get; set; }
        public string Html { get; set; }
        public string Href { get; set; }
        public string Id { get; set; }
        public string Src { get; set; }
        public string Class { get; set; }
        public string OnClick { get; set; }
        public Dictionary<string, string> Attr2Value { get; set; }

        public HtmlData()
        {
            Attr2Value = new Dictionary<string, string>();
            Text = Value = Style = Name = Html = Href = Id = Src = Class = OnClick = Title = Alt = string.Empty;
        }

        /// <summary>
        /// Parses the first number found in <see cref="Text"/> or in the named attribute.
        /// </summary>
        /// <param name="str">attribute name to read; empty means <see cref="Text"/></param>
        /// <returns>The parsed number, or 0 when no parsable number is present.</returns>
        /// <exception cref="KeyNotFoundException">The named attribute does not exist.</exception>
        public double ToDouble(string str = "")
        {
            var raw = Regex.Replace(ResolveSource(str), @"\r|\t|\n|\s", "").Trim();
            var number = Regex.Match(raw, @"(\d+,*\d*)").Groups[1].Value.Trim(',');

            // NOTE(review): parsing uses the current culture, so the meaning of ',' depends on
            // the machine's regional settings - confirm which culture callers expect.
            double res;
            return double.TryParse(number, out res) ? res : 0;
        }

        /// <summary>
        /// Parses the first run of digits found in <see cref="Text"/> or in the named attribute.
        /// </summary>
        /// <param name="str">attribute name to read; empty means <see cref="Text"/></param>
        /// <returns>The parsed integer, or 0 when no digits are present or the value does not fit an int.</returns>
        /// <exception cref="KeyNotFoundException">The named attribute does not exist.</exception>
        public int ToInt32(string str = "")
        {
            var raw = Regex.Replace(ResolveSource(str), @"\r|\t|\n|\s", "").Replace(",", ".").Trim();
            var digits = Regex.Match(raw, @"(\d+)").Groups[1].Value;

            int res;
            return int.TryParse(digits, out res) ? res : 0;
        }

        /// <summary>
        /// Returns the text to parse: <see cref="Text"/> for an empty name, otherwise the named attribute.
        /// </summary>
        private string ResolveSource(string attributeName)
        {
            if (string.IsNullOrEmpty(attributeName)) return Text ?? string.Empty;

            string value;
            if (!Attr2Value.TryGetValue(attributeName, out value))
            {
                // Same exception type the dictionary indexer raised before, now with a clear
                // message (the original built this exception but never threw it).
                throw new KeyNotFoundException("This attribute is not found: " + attributeName);
            }

            return value ?? string.Empty;
        }

        /// <summary>
        /// Extracts an HtmlData object for the first alternative xpath that matches a node.
        /// </summary>
        /// <param name="hpl">loader holding the parsed document</param>
        /// <param name="xpath">not [attr]</param>
        /// <returns>
        /// Data of the first matching node; an empty HtmlData when no alternative matches;
        /// null when Helper.SplitToNextMark yields no alternatives at all.
        /// </returns>
        public static HtmlData ExtractFirstHtmlData(HtmlPageLoader hpl, string xpath)
        {
            var triedAny = false;
            foreach (var xp in Helper.SplitToNextMark(xpath))
            {
                triedAny = true;
                // Move on to the next alternative when this one matches nothing; previously the
                // empty placeholder built for a null node was returned straight away.
                var node = hpl.HtmlDoc.DocumentNode.SelectSingleNode(xp);
                if (node != null) return GetHtmlData(node);
            }

            return triedAny ? new HtmlData() : null;
        }

        /// <summary>
        /// Extracts HtmlData objects for every node matched by the first alternative xpath that matches anything.
        /// </summary>
        /// <param name="hpl">loader holding the parsed document</param>
        /// <param name="xpath">not [attr]</param>
        /// <returns>The extracted items; empty when no alternative matches.</returns>
        public static List<HtmlData> ExtractListHtmlData(HtmlPageLoader hpl, string xpath)
        {
            var res = new List<HtmlData>();
            foreach (var xp in Helper.SplitToNextMark(xpath))
            {
                var htmlNodes = hpl.HtmlDoc.DocumentNode.SelectNodes(xp);
                if (htmlNodes == null) continue;

                res.AddRange(htmlNodes.Select(GetHtmlData));
                if (res.Count > 0) break;
            }

            return res;
        }

        /// <summary>
        /// Copies the node's decoded text, html and attributes; a null node yields an empty HtmlData.
        /// </summary>
        private static HtmlData GetHtmlData(HtmlNode htmlNode)
        {
            if (htmlNode == null) return new HtmlData();

            var hd = new HtmlData
            {
                Text = HttpUtility.HtmlDecode(htmlNode.InnerText),
                Html = HttpUtility.HtmlDecode(htmlNode.InnerHtml)
            };

            foreach (var pair in htmlNode.Attributes)
            {
                // Indexer instead of Add: a repeated attribute name must not throw; the last
                // occurrence wins, matching how the named properties below are overwritten.
                hd.Attr2Value[pair.Name] = pair.Value;

                var value = HttpUtility.HtmlDecode(pair.Value);
                switch (pair.Name)
                {
                    case "style":
                        hd.Style = value;
                        break;
                    case "name":
                        hd.Name = value;
                        break;
                    case "value":
                        hd.Value = value;
                        break;
                    case "href":
                        hd.Href = value;
                        break;
                    case "src":
                        hd.Src = value;
                        break;
                    case "id":
                        hd.Id = value;
                        break;
                    case "class":
                        hd.Class = value;
                        break;
                    case "onclick":
                        hd.OnClick = value;
                        break;
                    case "title":
                        hd.Title = value;
                        break;
                    case "alt":
                        hd.Alt = value;
                        break;
                }
            }

            return hd;
        }
    }
    #endregion
}