MyDataProvider » Blog » walgreens c# webscraper

Walgreens C# web scraper

  • by

using System;
using System.Collections.Generic;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using CatalogLoader;
using CatalogLoaderCommon;
using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

//css_reference HtmlAgilityPack;
//css_reference NPOI.dll
//css_reference NPOI.OOXML.dll
//css_reference NPOI.OpenXml4Net.dll
//css_reference NPOI.OpenXmlFormats.dll
//css_reference ICSharpCode.SharpZipLib.dll

namespace CatalogLoaderVSScriptEditor
{
    public class CustomScript : CustomScriptBase
    {
        // Page loaders cached by the Init(...) overloads, one per processing stage.
        public HtmlPageLoader HplProduct { get; set; }
        public HtmlPageLoader HplCatalog { get; set; }
        public HtmlPageLoader HplLinks { get; set; }

        // When true, StartProduct renames the product image via Helper.MakeImgName2Sku.
        // NOTE(review): nothing in the visible code assigns this flag — confirm it is still needed.
        private bool _imageNameSku;
        // Hash of product name + URL computed in StartProduct; used there as a fallback product ID.
        private string _hashCod;
        // Shop URL read from the grabber settings by the Init(...) overloads (separate from _domain).
        private string _domen;
        private Product _product;
        private Category _category;
        // Task handle used for logging (AddLogInfo / AddLogError).
        private TaskInfo _mti;
        // Site root; prepended to category and product URLs taken from the page.
        string _domain = "https://www.walgreens.com";

        // Grab-process state captured in Login; its Proxy supplies the page loaders.
        private GrabProcessState _gps;

        // Parameterless constructor; fields are populated later by Login and the stage callbacks.
        public CustomScript() { }

        // Stores the given category as the script's current category.
        public CustomScript(Category mroot)
        {
            _category = mroot;
        }

        // Stores the given product as the script's current product.
        public CustomScript(Product product)
        {
            _product = product;
        }

        // UPC codes to look up; filled by FromFileRead, consumed by GetProductLinksForCategory.
        List<string> _upcList = new List<string>();

        /// <summary>
        /// Captures the grab-process state and task info, then loads the UPC list.
        /// No site login is performed.
        /// </summary>
        /// <param name="p">Login parameters supplying the process state and the owning process.</param>
        public override void Login(LoginScriptParameters p)
        {
            _gps = p.State as GrabProcessState;
            _mti = p.Process.m_ti;

            // The UPC list is always read from InputFile.txt in the application data directory
            // (the path is not taken from a user parameter).
            FromFileRead(Path.Combine(UtilSmall.ApplicationDataDirectory, "InputFile.txt"));
        }

        /// <summary>
        /// Loads the UPC list from a text file, one code per line.
        /// Lines are trimmed and blank lines are dropped so that stray whitespace
        /// or a trailing empty line never turns into a search request.
        /// </summary>
        /// <param name="filePath">Path of the text file to read.</param>
        public void FromFileRead(string filePath)
        {
            _upcList = File.ReadAllLines(filePath)
                .Select(line => line.Trim())
                .Where(line => line.Length > 0)
                .ToList();
        }
        /// <summary>
        /// Builds the category tree from the JSON state embedded in the site's home page
        /// (the script element containing <c>__HEADER_INITIAL_STATE__</c>).
        /// When the state script is missing or cannot be parsed, the error is logged and
        /// an empty root category is returned instead of throwing.
        /// </summary>
        /// <param name="p">Catalog-build parameters; <c>p.Root</c> receives the resulting tree.</param>
        public override void GrabCatalogBuild(GrabCatalogBuildScriptParameters p)
        {
            _mti = p.Process.m_ti;
            _category = new Category();
            // Publish the root up front so every exit path leaves p.Root set.
            p.Root = _category;

            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();
            hpl.Load(_domain);

            var script = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//script[contains(text(),'__HEADER_INITIAL_STATE__')]", "", false);

            // Greedy match from the first '{' to the last "}};" isolates the JSON object literal.
            var match = new Regex(@"{[\w\W]+}};").Match(script ?? string.Empty);
            if (!match.Success)
            {
                _mti.AddLogError("Header state script was not found on " + _domain + "; the category tree is empty.");
                return;
            }

            JObject headerState;
            try
            {
                headerState = JObject.Parse(match.Value.TrimEnd(';'));
            }
            catch (JsonReaderException ex)
            {
                _mti.AddLogError("Header state script could not be parsed: " + ex.Message);
                return;
            }

            var categories = headerState.SelectToken("$.header.headNavData.menu-shop-products.categories");
            if (categories != null)
            {
                // Top-level entries have the same shape as nested ones, so the recursive helper handles both.
                SubCategoryAdd(categories, _category);
            }
        }

        /// <summary>
        /// Recursively converts a JSON list of category entries into child categories of
        /// <paramref name="parent"/>. Each entry supplies <c>name</c> and <c>url</c>, and may
        /// carry a nested <c>categories</c> list that is processed the same way.
        /// </summary>
        /// <param name="script">JSON token enumerating the category entries.</param>
        /// <param name="parent">Category that receives the created children.</param>
        public void SubCategoryAdd(JToken script, Category parent)
        {
            foreach (var entry in script)
            {
                var child = new Category
                {
                    Name = entry.SelectToken("$.name").ToString(),
                    SourceUrl = _domain + entry.SelectToken("$.url").ToString()
                };
                parent.AddCategory(child);

                var nested = entry.SelectToken("$.categories");
                if (nested == null)
                {
                    continue;
                }

                SubCategoryAdd(nested, child);
            }
        }
      

        // Intentionally empty: this script does no work in these two callbacks.
        public override void ProcessFinished(ProcessFinishedScriptParameters p) { }
        public override void RunCategory(RunCategoryScriptParameters p) { }

        /// <summary>
        /// Resolves every UPC from the input list to a product link by querying the site
        /// search (<c>/search/results.jsp?Ntt=&lt;upc&gt;</c>) and taking the link of the
        /// product card found on the result page.
        /// UPCs that are blank, or whose search yields no product card, are skipped so that
        /// the bare domain is never queued as a product link.
        /// </summary>
        /// <param name="p">Parameters whose <c>Category.ProductLinks</c> collects the links.</param>
        public override void GetProductLinksForCategory(GetProductLinksForCategoryScriptParameters p)
        {
            var hpl = _gps.Proxy.GetHtmlPageLoaderEmpty();

            foreach (var upc in _upcList)
            {
                if (string.IsNullOrWhiteSpace(upc))
                {
                    continue;
                }

                var code = upc.Trim();
                _mti.AddLogInfo("Upc:" + code);

                hpl.Load("https://www.walgreens.com/search/results.jsp?Ntt=" + code);
                var productLink = TextUtils.GetHtmlValue(hpl.HtmlDoc, "//ul[@class='product-container']/li[@class='item card card__product']//a", "href", true);
                if (string.IsNullOrEmpty(productLink))
                {
                    // Nothing matched: adding _domain + "" would queue the home page as a product.
                    _mti.AddLogInfo("No product found for Upc:" + code);
                    continue;
                }

                p.Category.ProductLinks.Add(_domain + productLink);
                _mti.AddLogInfo("product link count:" + p.Category.ProductLinks.Count());
            }
        }

        /// <summary>
        /// Post-processes a grabbed product: derives the bar code from the product URL and
        /// rebuilds the category chain from the <c>CATEGORY_PATH</c> attribute (a list of
        /// <c>&lt;li&gt;&lt;a&gt;name&lt;/a&gt;&lt;/li&gt;</c> items).
        /// </summary>
        /// <param name="p">Parameters carrying the product to update.</param>
        public override void RunProduct(RunProductScriptParameters p)
        {
            // Everything after the last "Ntt=" marker is taken as the bar code.
            // NOTE(review): when the URL has no "Ntt=" the whole URL becomes the bar code — confirm
            // that product links carry the marker.
            p.Product.BarCode = p.Product.Url.Split(new string[] { "Ntt=" }, StringSplitOptions.None).Last();

            string categoryPath = p.Product.GetAttributeValue("CATEGORY_PATH");
            if (string.IsNullOrEmpty(categoryPath))
            {
                return;
            }

            var hDoc = new HtmlDocument();
            hDoc.LoadHtml(categoryPath);

            // SelectNodes yields null (not an empty list) when nothing matches.
            var nodes = hDoc.DocumentNode.SelectNodes("./li");
            if (nodes == null)
            {
                return;
            }

            var categoryLinks = new List<string>();
            var categoryNames = new List<string>();
            foreach (var item in nodes)
            {
                var anchor = item.SelectSingleNode("./a");
                if (anchor == null)
                {
                    continue;
                }

                // The breadcrumb links are not used; a placeholder URL is stored for every level.
                categoryLinks.Add("https://#");
                categoryNames.Add(anchor.InnerHtml);
            }

            if (categoryNames.Count == 0)
            {
                return;
            }

            // Build root -> ... -> leaf and attach the product to the deepest category.
            p.Product.Category = CategoriesAdd(0, categoryNames, categoryLinks, new Category());
        }

        /// <summary>
        /// Builds a single chain of nested categories under <paramref name="parent"/>, one level
        /// per list entry starting at <paramref name="index"/>, and returns the deepest category.
        /// </summary>
        /// <param name="index">Position of the first entry to add.</param>
        /// <param name="CategoriesNames">Category names, indexed in parallel with the links.</param>
        /// <param name="CategoriesLinks">Category URLs; its length bounds the chain.</param>
        /// <param name="parent">Category the chain is attached to.</param>
        /// <returns>The last category added, or <paramref name="parent"/> when nothing was added.</returns>
        public Category CategoriesAdd(int index, List<string> CategoriesNames, List<string> CategoriesLinks, Category parent)
        {
            var current = parent;
            for (var level = index; level < CategoriesLinks.Count; level++)
            {
                var child = new Category();
                child.Name = CategoriesNames[level];
                child.SourceUrl = CategoriesLinks[level];
                current.AddCategory(child);

                // Each new category becomes the parent of the next level.
                current = child;
            }

            return current;
        }


        #region Settings

        /// <summary>
        /// Captures the product-stage context and lazily creates the product page loader.
        /// Does nothing further when the loader already exists or when the process is not
        /// an <c>OneProductLoader</c>.
        /// </summary>
        /// <param name="p">Product-stage parameters.</param>
        private void Init(RunProductScriptParameters p)
        {
            _mti = p.Process.m_ti;
            _product = p.Product;
            _category = p.Category;

            if (HplProduct != null)
            {
                return;
            }

            // Check the cast result before touching hpl.State; the settings lookup needs it too.
            var hpl = p.Process as OneProductLoader;
            if (hpl == null)
            {
                return;
            }

            if (string.IsNullOrEmpty(_domen))
            {
                _domen = hpl.State.GrabberSettings.Settings.ShopUrl;
            }

            HplProduct = Helper.Hpl = hpl.State.Proxy.GetHtmlPageLoader(_domen);
        }

        /// <summary>
        /// Captures the link-collection context and lazily creates the links page loader.
        /// Does nothing further when the loader already exists or when the process is not
        /// an <c>OneCategoryLoader</c>.
        /// </summary>
        /// <param name="p">Link-collection parameters.</param>
        private void Init(GetProductLinksForCategoryScriptParameters p)
        {
            _mti = p.Process.m_ti;
            _category = p.Category;

            if (HplLinks != null)
            {
                return;
            }

            // Check the cast result before touching hpl.State; the settings lookup needs it too.
            var hpl = p.Process as OneCategoryLoader;
            if (hpl == null)
            {
                return;
            }

            if (string.IsNullOrEmpty(_domen))
            {
                _domen = hpl.State.GrabberSettings.Settings.ShopUrl;
            }

            HplLinks = hpl.State.Proxy.GetHtmlPageLoader(_domen);
        }

        /// <summary>
        /// Captures the catalog-build context, resets the root category (ID "0") and lazily
        /// creates the catalog page loader. Does nothing further when the loader already
        /// exists or when the process is not a <c>GrabCatalogFromWeb</c>.
        /// </summary>
        /// <param name="p">Catalog-build parameters.</param>
        private void Init(GrabCatalogBuildScriptParameters p)
        {
            _mti = p.Process.m_ti;
            _category = new Category { ID = "0" };

            if (HplCatalog != null)
            {
                return;
            }

            // Check the cast result before touching hpl.State; the settings lookup needs it too.
            var hpl = p.Process as GrabCatalogFromWeb;
            if (hpl == null)
            {
                return;
            }

            if (string.IsNullOrEmpty(_domen))
            {
                _domen = hpl.State.GrabberSettings.Settings.ShopUrl;
            }

            HplCatalog = hpl.State.Proxy.GetHtmlPageLoader(_domen);
        }

        /// <summary>
        /// Normalizes a freshly grabbed product: decodes the name, computes the name+URL hash,
        /// strips id/class attributes and collapses whitespace in both descriptions, fills in
        /// a missing ID and article, registers the main image, and normalizes price and URL rewrite.
        /// </summary>
        /// <param name="product">Product to normalize in place.</param>
        private void StartProduct(Product product)
        {
            const string attributesToStrip = "id[next]class";
            const string whitespacePattern = @"\r|\n|\t|\s{2,}";

            product.Name = HttpUtility.HtmlDecode(product.Name);

            var hash = Helper.GetHashCodeString(product.Name + product.Url);
            Helper.HashCod = hash;
            _hashCod = hash;

            if (!string.IsNullOrEmpty(product.FullDescription))
            {
                var stripped = product.FullDescription.RemoveTagAttr(attributesToStrip);
                product.FullDescription = Regex.Replace(stripped, whitespacePattern, "");
            }

            // Remove tag attributes from the short description as well.
            if (!string.IsNullOrEmpty(product.SmallDescription))
            {
                var stripped = product.SmallDescription.RemoveTagAttr(attributesToStrip);
                product.SmallDescription = Regex.Replace(stripped, whitespacePattern, "");
            }

            // The hash doubles as the ID when the grabber did not provide one.
            if (string.IsNullOrEmpty(product.ID))
            {
                product.ID = _hashCod;
            }

            if (string.IsNullOrEmpty(product.GetAttributeValue("SYS_PRODUCT_ART")))
            {
                product.SetAttributeValue("SYS_PRODUCT_ART", product.ID);
            }

            // Prefer the full-size main image; fall back to the small one.
            var mainImage = product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE");
            if (string.IsNullOrEmpty(mainImage))
            {
                mainImage = product.GetAttributeValue("SYS_PRODUCT_MAIN_IMAGE_SMALL");
            }
            product.ImageAdd(mainImage);

            if (_imageNameSku)
            {
                product.Image = Helper.MakeImgName2Sku(product.ImageFull, 0);
            }

            product.Price = product.Price.GetTruePrice();

            if (!string.IsNullOrEmpty(product.UrlRewrite))
            {
                product.UrlRewrite = Helper.UrlRewrite(product.UrlRewrite, "-");
            }
        }

        /// <summary>
        /// Final clean-up of a product: cleans HTML blocks, normalizes the price, strips
        /// id/class attributes from both descriptions and, when the price is not valid,
        /// logs the problem and sets the price to "0".
        /// </summary>
        /// <param name="product">Product to finalize in place.</param>
        private void FinishProduct(Product product)
        {
            const string attributesToStrip = "id[next]class";

            product.HtmlBlocksClean();
            product.Price = product.Price.GetTruePrice();

            // Remove tag attributes from the full description.
            if (!string.IsNullOrEmpty(product.FullDescription))
            {
                product.FullDescription = product.FullDescription.RemoveTagAttr(attributesToStrip);
            }

            // Remove tag attributes from the short description.
            if (!string.IsNullOrEmpty(product.SmallDescription))
            {
                product.SmallDescription = product.SmallDescription.RemoveTagAttr(attributesToStrip);
            }

            if (product.PriceIsOk)
            {
                return;
            }

            var message = "Price: '" + product.Price +
                          "' does not exist or not configured and will be set 0. Link product: \" " + product.Url +
                          " \"";
            _mti.AddLogError(message);
            product.Price = "0";
        }
    
        #endregion

    }

    #region class Helper
    public static class Helper
    {
        // Last product hash; assigned by CustomScript.StartProduct.
        public static string HashCod = "";
        // Shared page loader used by GetListFromBlock; assigned in CustomScript.Init(RunProductScriptParameters).
        public static HtmlPageLoader Hpl;
        // Lookup table for Translate; GetTranslate queries it with lower-cased, trimmed keys
        // and creates an empty table when it is still null.
        public static Dictionary<string, string> TranslatePairs { get; set; }
	
	
        /// <summary>
        /// Normalizes a price string: replaces every '.' with ',' and removes all whitespace.
        /// </summary>
        /// <param name="str">Raw price text; may be null or empty.</param>
        /// <returns>
        /// The normalized price. A null or empty input is returned unchanged so that callers
        /// can still report a missing price instead of failing here.
        /// </returns>
        public static string GetTruePrice(this string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            str = str.Replace(".", ",");
            // \s+ rather than \s*: only real whitespace runs need replacing.
            return Regex.Replace(str, @"\s+", "");
        }
	
        /// <summary>
        /// Translates a string using the pairs known to GetTranslate. Unless
        /// <paramref name="wordTranslate"/> is set, punctuation-delimited fragments are
        /// replaced first; afterwards every space-separated word is translated.
        /// </summary>
        /// <param name="str">Text to translate.</param>
        /// <param name="wordTranslate">True to skip the fragment pass and translate word by word only.</param>
        /// <returns>The translated words joined by single spaces, trimmed.</returns>
        public static string Translate(this String str, bool wordTranslate = false)
        {
            if (!wordTranslate)
            {
                var separators = new[] { ',', '.', '!', '?', ';', ':', '(', ')', '"', '[', ']', '{', '}' };
                foreach (var fragment in str.Split(separators))
                {
                    if (string.IsNullOrEmpty(fragment))
                    {
                        continue;
                    }

                    str = str.Replace(fragment, GetTranslate(fragment));
                }
            }

            // Second pass: translate the individual words.
            var translatedWords = str.Split(new[] { ' ' }).Select(word => GetTranslate(word));
            return string.Join(" ", translatedWords).Trim();
        }

        /// <summary>
        /// Looks up the translation of a single word or fragment. The key is the input
        /// lower-cased (invariant) and trimmed; when no pair exists the trimmed input is returned.
        /// </summary>
        /// <param name="str">Word or fragment to translate.</param>
        /// <returns>The translation, or the trimmed input when none is known.</returns>
        private static string GetTranslate(string str)
        {
            // Create an empty table on first use so the lookup below never hits null.
            if (TranslatePairs == null)
            {
                TranslatePairs = new Dictionary<string, string>();
            }

            var key = str.ToLowerInvariant().Trim();
            string translated;
            if (TranslatePairs.TryGetValue(key, out translated))
            {
                return translated;
            }

            return str.Trim();
        }

        /// <summary>
        /// Loads <paramref name="input"/> into the shared page loader (Hpl.SetContent) and returns
        /// the values that TextUtils.ExtractValuesByXpath produces for <paramref name="xpath"/>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <paramref name="attr"/> is never used — an empty string is passed to
        /// ExtractValuesByXpath instead; confirm whether attr was meant to go there.
        /// NOTE(review): Hpl must already be assigned; a null Hpl would throw here.
        /// </remarks>
        public static List<string> GetListFromBlock(string input, string xpath, string attr)
        {
            List<string> list;
            Hpl.SetContent(input);
            TextUtils.ExtractValuesByXpath(xpath, "", Hpl.HtmlDoc, true, out list);
            return list;
        }

        /// <summary>
        /// Cuts the string at the first occurrence of a stop word (ordinal comparison).
        /// </summary>
        /// <param name="str">Text to cut.</param>
        /// <param name="stop">Marker at which the text ends.</param>
        /// <returns>The part before the marker, or the whole string when the marker is absent.</returns>
        public static string StopWord(this String str, string stop)
        {
            var cut = str.IndexOf(stop, StringComparison.Ordinal);
            if (cut < 0)
            {
                return str;
            }

            return str.Substring(0, cut);
        }

        /// <summary>
        /// Returns the text that follows the first occurrence of a start word (ordinal comparison).
        /// </summary>
        /// <param name="str">Text to cut.</param>
        /// <param name="stop">Marker after which the result begins.</param>
        /// <returns>The part after the marker, or the whole string when the marker is absent.</returns>
        public static string StartWord(this String str, string stop)
        {
            var found = str.IndexOf(stop, StringComparison.Ordinal);
            if (found < 0)
            {
                return str;
            }

            var resumeAt = found + stop.Length;
            return str.Substring(resumeAt);
        }

        /// <summary>
        /// Turns a link into an absolute one. Links starting with "http" are returned as is,
        /// protocol-relative links ("//host/...") get the "http:" scheme, and anything else is
        /// appended to <paramref name="domen"/> with exactly one slash in between.
        /// </summary>
        /// <param name="str">Link to check and complete.</param>
        /// <param name="domen">Site root used for relative links; null is treated as empty.</param>
        /// <returns>The completed link.</returns>
        public static string TrueLink(this String str, string domen="")
        {
            // Ordinal comparison: URL schemes are not linguistic text.
            if (str.StartsWith("http", StringComparison.Ordinal))
            {
                return str;
            }

            if (str.StartsWith("//", StringComparison.Ordinal))
            {
                return "http:" + str;
            }

            // Drop one leading slash from the path and one trailing slash from the root.
            var path = Regex.Replace(str, @"^\/", "");
            var root = Regex.Replace(domen ?? string.Empty, @"\/$", "");
            return root + "/" + path;
        }

        /// <summary>
        /// Cuts the string at the last occurrence of a stop word.
        /// </summary>
        /// <param name="str">Text to cut.</param>
        /// <param name="stop">Marker searched with an ordinal comparison.</param>
        /// <returns>
        /// Everything before the last occurrence of <paramref name="stop"/>,
        /// or the unchanged string when the marker is absent.
        /// </returns>
        public static string LastStopWord(this String str, string stop)
        {
            var cutAt = str.LastIndexOf(stop, StringComparison.Ordinal);
            if (cutAt < 0)
            {
                return str;
            }

            return str.Substring(0, cutAt);
        }

        /// <summary>
        /// Keeps only the text that follows the last occurrence of a start word.
        /// </summary>
        /// <param name="str">Text to cut.</param>
        /// <param name="stop">Marker searched with an ordinal comparison.</param>
        /// <returns>
        /// Everything after the last occurrence of <paramref name="stop"/>,
        /// or the unchanged string when the marker is absent.
        /// </returns>
        public static string LastStartWord(this String str, string stop)
        {
            var markerAt = str.LastIndexOf(stop, StringComparison.Ordinal);
            if (markerAt < 0)
            {
                return str;
            }

            var tailStart = markerAt + stop.Length;
            return str.Substring(tailStart);
        }

        /// <summary>
        /// Removes quoted attribute assignments (name="..." or name='...') from markup.
        /// </summary>
        /// <param name="str">Markup to clean.</param>
        /// <param name="replaceItems">Attribute names separated by "[next]"; each name is used as a regex fragment.</param>
        /// <returns>The markup with every matching attribute assignment removed.</returns>
        public static string RemoveTagAttr(this String str, string replaceItems)
        {
            var cleaned = str;
            foreach (var attributeName in SplitToNextMark(replaceItems))
            {
                var pattern = attributeName + @"=(""[^""]*""|'[^']*')";
                cleaned = Regex.Replace(cleaned, pattern, "");
            }
            return cleaned;
        }

        /// <summary>
        /// Builds the site root ("http://" + host) from a category's source URL.
        /// </summary>
        /// <param name="category">Category whose <c>SourceUrl</c> is parsed.</param>
        /// <returns>The string "http://" followed by the host part of the URL.</returns>
        public static string GetDomen(Category category)
        {
            var sourceUri = new Uri(category.SourceUrl);
            return string.Concat("http://", sourceUri.Host);
        }
        /// <summary>
        /// Builds the site root ("http://" + host) from a product's URL.
        /// </summary>
        /// <param name="product">Product whose <c>Url</c> is parsed.</param>
        /// <returns>The string "http://" followed by the host part of the URL.</returns>
        public static string GetDomen(Product product)
        {
            var productUri = new Uri(product.Url);
            return string.Concat("http://", productUri.Host);
        }

        /// <summary>
        /// Returns the absolute value of the string's hash code as invariant-culture text.
        /// </summary>
        /// <param name="item">String to hash.</param>
        /// <returns>A non-negative decimal number without a sign.</returns>
        public static string GetHashCodeString(string item)
        {
            // Widen to long before taking the absolute value: negating int.MinValue as an int
            // overflows and stays negative, which would leak a "-" into the result.
            var hash = Math.Abs((long)item.GetHashCode());

            return hash.ToString(CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Builds an image file name from <c>HashCod</c> plus the extension found in the image link.
        /// </summary>
        /// <param name="imgHref">Image link; the first ".jpg", ".png", ".bmp", ".gif" or ".jpeg" found in it (case-insensitive) becomes the extension, ".jpg" when none is found.</param>
        /// <param name="idx">Ordinal number of the image; values above 1 are appended as "-idx".</param>
        /// <returns>HashCod + extension, or HashCod + "-" + idx + extension when idx is greater than 1.</returns>
        public static string MakeImgName2Sku(string imgHref, int idx)
        {
            var extensionMatch = Regex.Match(imgHref, @"\.(jpg|png|bmp|gif|jpeg)", RegexOptions.IgnoreCase);
            var extension = string.IsNullOrEmpty(extensionMatch.Value) ? ".jpg" : extensionMatch.Value;

            if (idx <= 1)
            {
                return HashCod + extension;
            }

            return HashCod + "-" + idx.ToString(CultureInfo.InvariantCulture) + extension;
        }

        /// <summary>
        /// Joins all image links into one string using the given delimiter.
        /// </summary>
        /// <param name="listImages">Image links to join.</param>
        /// <param name="delimeter">Text placed between consecutive links.</param>
        /// <returns>The joined links; an empty string for an empty list.</returns>
        public static string GetAllImageLinksAsString(List<string> listImages, string delimeter)
        {
            // string.Join also covers the empty list, where stripping a leading delimiter
            // from an empty string used to throw.
            return string.Join(delimeter, listImages);
        }

        /// <summary>
        /// Joins all image names into one string, putting a prefix in front of every name.
        /// </summary>
        /// <param name="listImages">Image names to join.</param>
        /// <param name="delimeter">Text placed between consecutive names.</param>
        /// <param name="prefix">Text prepended to each name.</param>
        /// <returns>The joined, prefixed names; an empty string for an empty list.</returns>
        public static string GetAllImageNamesAsString(List<string> listImages, string delimeter, string prefix)
        {
            // string.Join also covers the empty list, where stripping a leading delimiter
            // from an empty string used to throw.
            return string.Join(delimeter, listImages.Select(img => prefix + img));
        }

        /// <summary>
        /// Splits a string into parts on the given separator, dropping empty parts.
        /// </summary>
        /// <param name="input">Text to split.</param>
        /// <param name="separator">Separator text; "[next]" by default.</param>
        /// <returns>A list with the non-empty parts in their original order.</returns>
        public static List<string> SplitToNextMark(string input, string separator = "[next]")
        {
            var separators = new[] { separator };
            var pieces = input.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            return new List<string>(pieces);
        }

        /// <summary>
        /// Produces a URL-rewrite string using a custom delimiter instead of underscores.
        /// </summary>
        /// <param name="str">Text passed to <c>AttributableItem.UrlRewriteGet</c>.</param>
        /// <param name="delim">Delimiter that replaces every run of underscores in the rewritten text.</param>
        /// <returns>The rewritten text with one leading and one trailing delimiter removed.</returns>
        public static string UrlRewrite(string str, string delim)
        {
            str = new Regex(@"_+").Replace(AttributableItem.UrlRewriteGet(str), delim);
            // Escape the delimiter: characters such as "." or "+" are regex metacharacters and
            // would otherwise strip arbitrary leading/trailing characters or break the pattern.
            var edge = Regex.Escape(delim);
            str = new Regex(string.Format("^{0}|{0}$", edge)).Replace(str, "");
            return str;
        }

        /// <summary>
        /// Splits an "xpath[attr]attribute" expression into its XPath and attribute parts.
        /// </summary>
        /// <param name="xpath">Expression to split; null is treated as empty.</param>
        /// <param name="separator">Separator between XPath and attribute; "[attr]" by default.</param>
        /// <returns>
        /// A pair whose key is the first non-empty part and whose value is the second one;
        /// missing parts are returned as empty strings.
        /// </returns>
        public static KeyValuePair<string, string> GetPairsXpath2Attr(string xpath, string separator = "[attr]")
        {
            var parts = (xpath ?? string.Empty).Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries);

            // RemoveEmptyEntries yields an empty array for "" or a bare separator,
            // so the first element cannot be indexed unconditionally.
            var path = parts.Length > 0 ? parts[0] : string.Empty;
            var attribute = parts.Length > 1 ? parts[1] : string.Empty;
            return new KeyValuePair<string, string>(path, attribute);
        }

        /// <summary>
        /// Extracts a file name from a link by removing everything up to and including the last slash.
        /// </summary>
        /// <param name="href">Link to process.</param>
        /// <returns>The text after the last "/", or the whole link when it contains no slash.</returns>
        public static string CreateNameFromHref(string href)
        {
            const string upToLastSlash = @"^[\w\W]*/";
            return new Regex(upToLastSlash).Replace(href, string.Empty);
        }

        /// <summary>
        /// Wrapper over TextUtils.ExtractValuesByXpath that understands the [next] and [attr] markers.
        /// </summary>
        /// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument to search.</param>
        /// <param name="xpath">Alternatives in the form "xpath[attr]attribute[next]xpath[attr]attribute"; they are tried in order.</param>
        /// <returns>The values of the first alternative that yields a non-empty list, otherwise an empty list.</returns>
        public static List<string> ExtractValuesByXpath(HtmlDocument htmlDoc, string xpath)
        {
            foreach (var alternative in SplitToNextMark(xpath))
            {
                var pair = GetPairsXpath2Attr(alternative);

                List<string> found;
                TextUtils.ExtractValuesByXpath(pair.Key, pair.Value, htmlDoc, true, out found);

                if (found == null || found.Count == 0)
                {
                    continue;
                }

                return found;
            }

            return new List<string>(0);
        }

        /// <summary>
        /// Wrapper over TextUtils.ExtractFirstValueByXpath that understands the [next] and [attr] markers.
        /// </summary>
        /// <param name="htmlDoc">HtmlAgilityPack.HtmlDocument to search.</param>
        /// <param name="xpath">Alternatives in the form "xpath[attr]attribute[next]xpath[attr]attribute"; they are tried in order.</param>
        /// <returns>The first non-empty value produced by an alternative, otherwise an empty string.</returns>
        public static string ExtractFirstValueByXpath(HtmlDocument htmlDoc, string xpath)
        {
            foreach (var alternative in SplitToNextMark(xpath))
            {
                var pair = GetPairsXpath2Attr(alternative);
                var value = TextUtils.ExtractFirstValueByXpath(pair.Key, pair.Value, htmlDoc);

                if (string.IsNullOrEmpty(value))
                {
                    continue;
                }

                return value;
            }

            return string.Empty;
        }
    
        /// <summary>
        /// Replaces the loader's content with the list markup rebuilt by <c>RestoreHtml.ProcessingLiTag</c>.
        /// </summary>
        /// <param name="hpl">Page loader whose content is rewritten in place.</param>
        public static void HtmlRestore(this HtmlPageLoader hpl)
        {
            var rebuilt = new RestoreHtml(hpl).ProcessingLiTag(hpl.Content);
            hpl.SetContent(rebuilt);
        }
    
        // Rebuilds a flat <ul> list out of "/li" matches, using the supplied page loader as a scratch parser.
        class RestoreHtml
        {
            // Loader that is repeatedly re-filled with fragments while processing; its content is overwritten.
            private readonly HtmlPageLoader _hpl;
            public RestoreHtml(HtmlPageLoader hpl)
            {
                _hpl = hpl;
            }

            // Loads temp into the loader, walks every value matched by "/li" and, for each one,
            // keeps looking for a further "/li" match inside it. The text in front of an inner
            // match is emitted as its own <li> item and processing continues with the inner value.
            // Returns all emitted items wrapped in <ul>...</ul>.
            public string ProcessingLiTag(string temp)
            {
                var text = string.Empty;
                _hpl.SetContent(temp);
                foreach (var item in ExtractValuesByXpath(_hpl.HtmlDoc, "/li"))
                {
                    temp = item;
                    while (true)
                    {
                        _hpl.SetContent(temp);
                        var innerData = ExtractFirstValueByXpath(_hpl.HtmlDoc, "/li");

                        if (!string.IsNullOrEmpty(innerData))
                        {
                            var ind = temp.IndexOf(innerData, StringComparison.Ordinal);
                            // The parsed value cannot be located in the raw text: stop without emitting anything for it.
                            if (ind <= -1) break;
                            // NOTE(review): "ind - 4" presumably skips the "<li>" tag that precedes the inner value;
                            // Substring throws when ind is smaller than 4 - confirm inputs always satisfy this.
                            var res = temp.Substring(0, ind - 4);
                            text += "<li>" + res + "</li>";
                            temp = innerData;
                        }
                        else
                        {
                            // No further nested match: the remaining text is the last item of this chain.
                            text += "<li>" + temp + "</li>";
                            break;
                        }
                    }
                }
                return "<ul>" + text + "</ul>";
            }
        }
    }
    #endregion

    #region class ParametrsProccessing
    /// <summary>
    /// Extracts product characteristics (name/value pairs) from an HTML block and stores them
    /// on the product as dynamic attributes.
    /// </summary>
    class ParametrsProcessing : CustomScript
    {
        // Matches any HTML tag; built once instead of on every attribute.
        private static readonly Regex AttributeTagRegex = new Regex(@"<[^>]*>");

        private readonly HtmlPageLoader _hpl;
        private readonly Product _product;
        // Stored from the constructor; not read anywhere inside this class.
        private bool _translate;
        private readonly string _htmlParameters;

        /// <summary>Target shop engine used by AddParametrsFromProduct.</summary>
        public CmsEngine Cms { get; set; }

        /// <summary>Attribute names that must not be added to the product; null or empty disables the filter.</summary>
        public List<string> NotTakedName { get; set; }

        /// <summary>
        /// Creates a processor for one product.
        /// </summary>
        /// <param name="product">Product that receives the attributes.</param>
        /// <param name="hpl">Page loader used as a scratch HTML parser; its content is overwritten.</param>
        /// <param name="htmlParameters">HTML block that contains the characteristics.</param>
        /// <param name="translate">Stored in a private field; unused by the visible members.</param>
        public ParametrsProcessing(Product product, HtmlPageLoader hpl, string htmlParameters, bool translate = false)
            : base(product)
        {
            _translate = translate;
            _htmlParameters = htmlParameters;
            _hpl = hpl;
            _product = product;
        }

        /// <summary>
        /// Reads characteristics with XPath expressions.
        /// </summary>
        /// <param name="split">XPath selecting one block per characteristic.</param>
        /// <param name="name">XPath of the characteristic name inside a block.</param>
        /// <param name="value">XPath of the characteristic value inside a block.</param>
        public void GetDinamicAttributesXpath(string split, string name, string value)
        {
            _hpl.SetContent(_htmlParameters);
            List<string> lines;
            TextUtils.ExtractValuesByXpath(split, "", _hpl.HtmlDoc, true, out lines);

            // Helper.ExtractValuesByXpath guards against a null list from the same extractor, so do the same here.
            if (lines == null)
                return;

            foreach (var line in lines)
            {
                _hpl.SetContent(line.Replace("\r", "").Replace("\t", "").Replace("\n", ""));
                var newColumnName = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, name);
                var attributeValue = Helper.ExtractFirstValueByXpath(_hpl.HtmlDoc, value);

                if (!string.IsNullOrWhiteSpace(newColumnName) && !string.IsNullOrWhiteSpace(attributeValue))
                    AddAttibute(newColumnName, attributeValue);
            }
        }

        /// <summary>
        /// Reads characteristics with regular expressions.
        /// </summary>
        /// <param name="split">Literal text on which the HTML block is split into characteristic lines.</param>
        /// <param name="name">Regex whose first group captures the characteristic name.</param>
        /// <param name="value">Regex whose first group captures the characteristic value.</param>
        public void GetDinamicAttributesRegex(string split, string name, string value)
        {
            _hpl.SetContent(_htmlParameters);
            var lines = _hpl.Content.Split(new [] { split }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var line in lines)
            {
                var val = line.Replace("\r", "").Replace("\t", "").Replace("\n", "");
                var columnName = Regex.Match(val, name).Groups[1].Value;
                var attributeValue = Regex.Match(val, value).Groups[1].Value;
                // Empty names/values are rejected inside AddAttibute.
                AddAttibute(columnName, attributeValue);
            }
        }

        /// <summary>
        /// Cleans a name/value pair (tags removed, quotes and semicolons replaced) and adds it to the product
        /// unless the name is filtered out or either part is blank.
        /// </summary>
        private void AddAttibute(string columnName, string attrValue)
        {
            columnName = AttributeTagRegex.Replace(columnName, "").Replace(":", "").Replace(";", ",").Replace("\"", "''").Trim();
            attrValue = AttributeTagRegex.Replace(attrValue, "").Replace("\"", "''").Replace(";", ",").Trim();
            if (!CheckNameProperties(columnName) && !string.IsNullOrWhiteSpace(columnName) && !string.IsNullOrWhiteSpace(attrValue))
                _product.AddDynamicAttribute(columnName, attrValue);
        }

        /// <summary>
        /// Writes all dynamic attributes as one "name:value,name:value" string into the
        /// engine-specific system attribute; does nothing for other engines.
        /// </summary>
// ReSharper disable once UnusedMember.Local
        private void AddParametrsFromProduct()
        {
            string targetAttribute;
            if (Cms == CmsEngine.Prestashop)
                targetAttribute = "SYS_PROPERTIES_PRESTA";
            else if (Cms == CmsEngine.Advatshop)
                targetAttribute = "SYS_PARAMETRS_ADVANTSHOP";
            else
                return;

            var pairs = _product.GetDynamicAttributes()
                .Select(property => string.Format("{0}:{1}", property.Key, _product.GetAttributeValue(property.Value)));
            _product.SetAttributeValue(targetAttribute, string.Join(",", pairs).Trim(','));
        }

        /// <summary>
        /// Tells whether an attribute name is on the exclusion list.
        /// </summary>
        private bool CheckNameProperties(string name)
        {
            if (NotTakedName == null || NotTakedName.Count < 1)
                return false;
            return NotTakedName.Contains(name);
        }

        /// <summary>
        /// Collects a table of values with XPath expressions.
        /// </summary>
        /// <param name="xpathSplit">XPath selecting the blocks; each block becomes one result row.</param>
        /// <param name="xpathData">Column expressions, for example "//a[attr]href[next]//div[attr]class".</param>
        /// <returns>One list per block holding the first value of every column expression, or an empty string when nothing matched.</returns>
        public List<List<string>> GetDataFromXpath(string xpathSplit, string xpathData)
        {
            _hpl.SetContent(_htmlParameters);
            var list = new List<List<string>>();
            List<string> lines;
            TextUtils.ExtractValuesByXpath(xpathSplit, "", _hpl.HtmlDoc, true, out lines);

            // Same null guard as Helper.ExtractValuesByXpath applies to this extractor.
            if (lines == null)
                return list;

            foreach (var line in lines)
            {
                var resultLine = new List<string>();
                _hpl.SetContent(line);
                foreach (var xpath in GetArrayFromDelim("[next]", xpathData))
                {
                    string[] xp = GetArrayFromDelim("[attr]", xpath);
                    List<string> res;
                    TextUtils.ExtractValuesByXpath(xp[0], (xp.Length > 1) ? xp[1] : string.Empty, _hpl.HtmlDoc, false, out res);
                    resultLine.Add(res != null && res.Count > 0 ? res[0] : string.Empty);
                }
                list.Add(resultLine);
            }
            return list;
        }


        /// <summary>
        /// Splits the input on a literal delimiter, keeping empty entries.
        /// </summary>
        private string[] GetArrayFromDelim(string delim, string input)
        {
            return input.Split(new [] { delim }, StringSplitOptions.None);
        }
    }

    // Shop engines that ParametrsProcessing distinguishes when it writes the combined properties attribute.
    enum CmsEngine
    {
        // Selects the "SYS_PROPERTIES_PRESTA" attribute in ParametrsProcessing.
        Prestashop,
        // Selects the "SYS_PARAMETRS_ADVANTSHOP" attribute in ParametrsProcessing.
        Advatshop
    }
    #endregion

    #region class ImageProcessing
    /// <summary>
    /// Collects additional image links from an HTML block and attaches them to a product.
    /// </summary>
    public class ImageProcessing : CustomScript
    {
        private readonly string _imagesBlock;
        private readonly bool _imageNameSku;
        private readonly string _domen;
        private readonly HtmlPageLoader _hpl;
        private readonly Product _product;
        private int _idx;
        private readonly int _countImg;


        /// <summary>
        /// Creates a processor without a limit on the number of images.
        /// </summary>
        /// <param name="product">Product that receives the images.</param>
        /// <param name="hpl">Page loader used as a scratch HTML parser.</param>
        /// <param name="imageNameSku">When true, images are added with a generated file name.</param>
        /// <param name="domen">Site root used to make relative links absolute.</param>
        /// <param name="imagesBlock">Block with image markup.</param>
        public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock) : this(product, hpl, imageNameSku, domen, imagesBlock, 0) { }

        /// <summary>
        /// Creates a processor that keeps at most <paramref name="countImg"/> images per call.
        /// </summary>
        /// <param name="product">Product that receives the images.</param>
        /// <param name="hpl">Page loader used as a scratch HTML parser.</param>
        /// <param name="imageNameSku">When true, images are added with a generated file name.</param>
        /// <param name="domen">Site root used to make relative links absolute.</param>
        /// <param name="imagesBlock">Block with image markup.</param>
        /// <param name="countImg">Maximum number of links taken from one source list; values of zero or less mean no limit.</param>
        public ImageProcessing(Product product, HtmlPageLoader hpl, bool imageNameSku, string domen, string imagesBlock, int countImg)
            : base(product)
        {
            _imagesBlock = imagesBlock;
            _imageNameSku = imageNameSku;
            _domen = domen;
            _hpl = hpl;
            _product = product;
            _countImg = countImg > 0 ? countImg : 0;
            // Numbering continues after the images the product already has.
            _idx = product.ImagesCount + 1;
        }

        /// <summary>
        /// Finds image links in the block with regular expressions and adds them to the product.
        /// </summary>
        /// <param name="split">Literal text on which the block is split into parts.</param>
        /// <param name="contains">Text a part must contain to be considered; empty accepts every part.</param>
        /// <param name="reg">Regexes separated by "[next]"; group 1 of each is the link.</param>
        public void GetAdditionalImages(string split, string contains, string reg)
        {
            var found = GetSourceImagesWithRegex(_imagesBlock, split, contains, reg);
            if (found.Count == 0)
                return;

            AddImage(found);
        }

        /// <summary>
        /// Finds image links in the block with an XPath expression and adds them to the product.
        /// </summary>
        /// <param name="xpath">Expression in the "[next]"/"[attr]" format accepted by Helper.ExtractValuesByXpath.</param>
        public void GetAdditionalImages(string xpath)
        {
            _hpl.SetContent(_imagesBlock);
            var found = Helper.ExtractValuesByXpath(_hpl.HtmlDoc, xpath);
            if (found.Count == 0)
                return;

            AddImage(found);
        }

        /// <summary>
        /// Extracts image links from a block of markup with regular expressions.
        /// </summary>
        /// <param name="blockAddImage">Markup to search.</param>
        /// <param name="split">Literal text on which the markup is split into parts.</param>
        /// <param name="contains">Text a part must contain; empty accepts every part, null rejects every part.</param>
        /// <param name="reg">Regexes separated by "[next]"; group 1 of each match is taken.</param>
        /// <returns>All non-blank, trimmed group-1 values in part order, then regex order.</returns>
        public static List<string> GetSourceImagesWithRegex(string blockAddImage, string split, string contains, string reg)
        {
            var images = new List<string>();
            var parts = blockAddImage.Split(new[] { split }, StringSplitOptions.RemoveEmptyEntries);

            // A null filter never lets any part through.
            if (contains == null)
                return images;

            foreach (var part in parts)
            {
                if (contains.Length > 0 && !part.Contains(contains))
                    continue;

                foreach (var pattern in Helper.SplitToNextMark(reg))
                {
                    var link = new Regex(pattern).Match(part).Groups[1].ToString().Trim();
                    if (!string.IsNullOrWhiteSpace(link))
                        images.Add(link);
                }
            }

            return images;
        }

        /// <summary>
        /// Adds the links to the product, skipping blanks and the link equal to the product's full image.
        /// </summary>
        private void AddImage(List<string> listSource)
        {
            var overLimit = _countImg > 0 && listSource.Count > _countImg;
            if (overLimit)
                RemoveItemToList(ref listSource);

            foreach (var source in listSource)
            {
                if (string.IsNullOrWhiteSpace(source))
                    continue;

                var absoluteLink = source.TrueLink(_domen);
                if (absoluteLink == _product.ImageFull)
                    continue;

                // Without a full image yet, numbering restarts at 1.
                if (string.IsNullOrEmpty(_product.ImageFull))
                    _idx = 1;

                if (!_imageNameSku)
                {
                    _product.ImageAdd(absoluteLink);
                    continue;
                }

                _product.ImageAdd(absoluteLink, GetNameImageFromHref(source, _idx));
                _idx++;
            }
        }

        /// <summary>
        /// Truncates the list to the configured image limit.
        /// </summary>
        private void RemoveItemToList(ref List<string> listSource)
        {
            var excess = listSource.Count - _countImg;
            listSource.RemoveRange(_countImg, excess);
        }


        /// <summary>
        /// Builds the image file name by delegating to Helper.MakeImgName2Sku.
        /// </summary>
        /// <param name="imgHref">Path to the image.</param>
        /// <param name="idx">Ordinal number of the image.</param>
        /// <returns>The file name produced by Helper.MakeImgName2Sku.</returns>
        public string GetNameImageFromHref(string imgHref, int idx)
        {
            var generatedName = Helper.MakeImgName2Sku(imgHref, idx);
            return generatedName;
        }
    }
    #endregion

    #region class WorksWithFiles

    /// <summary>
    /// Helper around one file on disk: reads text files into lookup collections, reads the first
    /// sheet of an Excel workbook, appends lines to text files and downloads files.
    /// </summary>
    class WorksWithFiles
    {
        public string PathToDir { get; private set; }
        public string PathToFile { get; private set; }
        public string NameFile { get; private set; }
        public string ExtenFile { get; private set; }

        // Lines of the text file; empty for Excel workbooks and for missing files.
        private readonly string[] _lines;
        // True when a text (non-Excel) file does not exist.
        private readonly bool _notFile;

        public WorksWithFiles(string file) : this(file, null) { }

        /// <summary>
        /// Remembers the path parts of the file and, for non-Excel files, loads all lines.
        /// </summary>
        /// <param name="file">Path to the file.</param>
        /// <param name="encoding">Encoding used to read a text file; null uses the File.ReadAllLines default.</param>
        public WorksWithFiles(string file, Encoding encoding)
        {
            GetValueFileVariables(file);

            // Always start from an empty array so the text-based CreateCollection overloads
            // never enumerate a null field when the file is an Excel workbook.
            _lines = new string[0];

            if (IsExcelFile())
                return;

            if (!File.Exists(PathToFile))
            {
                _notFile = true;
                return;
            }

            _lines = encoding != null ? File.ReadAllLines(PathToFile, encoding) : File.ReadAllLines(PathToFile);
        }

        /// <summary>
        /// Appends one "link;name" line to an image download list (used by old CatalogLoader versions).
        /// Call once per image; the name is derived from the link.
        /// </summary>
        /// <param name="path">Path to the list file.</param>
        /// <param name="imageLink">Link to the image.</param>
        public static void CreateImageDownloadFile(string path, string imageLink)
        {
            var text = string.Format("{0};{1}", imageLink, Helper.CreateNameFromHref(imageLink));
            CreateTextFile(path, text);
        }

        /// <summary>
        /// Appends one line to a text file in UTF-8, creating the file when needed.
        /// </summary>
        /// <param name="path">Path to the file.</param>
        /// <param name="text">Line to write.</param>
        public static void CreateTextFile(string path, string text)
        {
            CreateTextFile(path, text, Encoding.UTF8);
        }

        /// <summary>
        /// Appends one line to a text file, creating the file when needed.
        /// </summary>
        /// <param name="path">Path to the file; paths that do not contain the application data directory are placed under it.</param>
        /// <param name="text">Line to write.</param>
        /// <param name="encoding">File encoding; null means UTF-8.</param>
        public static void CreateTextFile(string path, string text, Encoding encoding)
        {
            var startupPath = UtilSmall.ApplicationDataDirectory;
            path = path.Replace("/", "\\");
            if (!path.Contains(startupPath))
            {
                if (!Regex.IsMatch(path, @"^\\"))
                    path = "\\" + path;
                path = startupPath + path;
            }

            if (encoding == null)
                encoding = Encoding.UTF8;

            // FileMode.Append creates the file when it is missing, so a single branch covers
            // both cases and there is no window between an existence check and the open.
            using (var writer = new StreamWriter(new FileStream(path, FileMode.Append), encoding))
            {
                writer.WriteLine(text);
            }
        }

        /// <summary>
        /// Appends the list as one ";"-separated line in UTF-8.
        /// </summary>
        /// <param name="path">Path to the file.</param>
        /// <param name="texts">Values to write.</param>
        public static void CreateTextFile(string path, List<string> texts)
        {
            CreateTextFile(path, texts, Encoding.UTF8);
        }

        /// <summary>
        /// Appends the list as one ";"-separated line; every value is trimmed.
        /// </summary>
        /// <param name="path">Path to the file.</param>
        /// <param name="texts">Values to write.</param>
        /// <param name="encoding">File encoding.</param>
        public static void CreateTextFile(string path, List<string> texts, Encoding encoding)
        {
            // string.Join also handles an empty list, which used to throw while stripping the leading ";".
            var text = string.Join(";", texts.Select(t => t.Trim()));
            CreateTextFile(path, text, encoding);
        }

        /// <summary>
        /// Builds a dictionary from the first two columns of the first sheet of an Excel workbook.
        /// Keys and values are lower-cased and trimmed; later rows overwrite earlier ones with the same key.
        /// </summary>
        /// <returns>The dictionary; empty when the file is not an Excel workbook or does not exist.</returns>
        public Dictionary<string, string> CreateCollectionFromExcel()
        {
            var dic = new Dictionary<string, string>();
            if (_notFile || !IsExcelFile() || !File.Exists(PathToFile))
                return dic;

            var sheet = GetParseXls();
            for (var i = 0; i <= sheet.LastRowNum; i++)
            {
                // Reading stops at the first row that lacks either of the two cells,
                // as before, but without relying on a catch-all exception handler.
                var row = sheet.GetRow(i);
                if (row == null)
                    break;

                var keyCell = row.GetCell(0);
                var valueCell = row.GetCell(1);
                if (keyCell == null || valueCell == null)
                    break;

                dic[keyCell.ToString().ToLowerInvariant().Trim()] = valueCell.ToString().ToLowerInvariant().Trim();
            }
            return dic;
        }

        /// <summary>
        /// Builds a dictionary from a two-column text file. Quotes are removed and both columns
        /// are lower-cased and trimmed; lines with fewer than two columns are skipped.
        /// </summary>
        /// <param name="delimeter">Column delimiter used in the file.</param>
        /// <param name="reverse">True to use the second column as the key.</param>
        /// <param name="valueString">False: values of repeated keys are skipped; true: they are appended to the existing value with ",".</param>
        public Dictionary<string, string> CreateCollection(string delimeter, bool reverse, bool valueString = false)
        {
            var dic = new Dictionary<string, string>();
            if (_notFile)
                return dic;

            foreach (var line in _lines)
            {
                var parts = line.Split(new [] { delimeter }, StringSplitOptions.RemoveEmptyEntries);
                if (parts.Length <= 1) continue;

                var first = parts[0].Replace("\"", string.Empty).ToLowerInvariant().Trim();
                var second = parts[1].Replace("\"", string.Empty).ToLowerInvariant().Trim();
                var key = reverse ? second : first;
                var value = reverse ? first : second;

                string existing;
                if (!dic.TryGetValue(key, out existing))
                    dic.Add(key, value);
                else if (valueString)
                    dic[key] = existing + "," + value;
            }
            return dic;
        }

        /// <summary>
        /// Builds a dictionary whose key is one column of the file and whose value is the list of the other columns.
        /// </summary>
        /// <param name="delimeter">Column delimiter used in the file.</param>
        /// <param name="columnIndex">Index of the column used as the key; lines that do not have this column are skipped.</param>
        /// <returns>
        /// Key to remaining columns (quotes removed, trimmed). A blank column, or one repeating an earlier
        /// column of the same line, is stored as an empty string. Only the first line per key is kept.
        /// </returns>
        public Dictionary<string, List<string>> CreateCollection(string delimeter, int columnIndex = 0)
        {
            var dic = new Dictionary<string, List<string>>();
            if (_notFile)
                return dic;

            foreach (var line in _lines)
            {
                var parts = line.Split(new[] { delimeter }, StringSplitOptions.RemoveEmptyEntries);

                // Also covers a negative index, which used to fail with an out-of-range access.
                if (columnIndex < 0 || columnIndex >= parts.Length)
                    continue;

                if (string.IsNullOrEmpty(parts[columnIndex]))
                    continue;

                var list = new List<string>();
                for (var i = 0; i < parts.Length; i++)
                {
                    if (i == columnIndex)
                        continue;

                    var part = parts[i].Replace("\"", string.Empty).Trim();
                    list.Add(!string.IsNullOrWhiteSpace(part) && !list.Contains(part) ? part : string.Empty);
                }

                var name = parts[columnIndex].Replace("\"", string.Empty).Trim();
                if (!dic.ContainsKey(name))
                    dic.Add(name, list);
            }
            return dic;
        }

        /// <summary>
        /// Returns the distinct lines of the file (quotes removed, trimmed) in file order.
        /// </summary>
        public List<string> CreateCollection()
        {
            var list = new List<string>();
            if (_notFile)
                return list;

            // The set gives O(1) duplicate checks; the list keeps the original order.
            var seen = new HashSet<string>();
            foreach (var line in _lines)
            {
                var cleaned = line.Replace("\"", string.Empty).Trim();
                if (seen.Add(cleaned))
                    list.Add(cleaned);
            }
            return list;
        }

        /// <summary>
        /// Splits the file path into the public path properties.
        /// </summary>
        private void GetValueFileVariables(string file)
        {
            PathToFile = file;
            NameFile = Path.GetFileName(file);
            PathToDir = Path.GetDirectoryName(file);
            ExtenFile = Path.GetExtension(file);
        }

        /// <summary>
        /// Tells whether the file extension is ".xlsx" or ".xls" (case-insensitive).
        /// </summary>
        private bool IsExcelFile()
        {
            return HasExtension(".xlsx") || HasExtension(".xls");
        }

        /// <summary>
        /// Compares the file extension with the given one, ignoring case.
        /// </summary>
        private bool HasExtension(string extension)
        {
            return string.Equals(ExtenFile, extension, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Downloads a file to <c>PathToFile</c>, creating the target directory first.
        /// </summary>
        /// <param name="linkFile">Link to the file.</param>
        public void DownloadFile(string linkFile)
        {
            CreateDirs();
            using (var wc = new WebClient())
            {
                wc.DownloadFile(linkFile, PathToFile);
            }
        }

        /// <summary>
        /// Creates the target directory when it does not exist yet.
        /// </summary>
        private void CreateDirs()
        {
            // A bare file name has no directory part; there is nothing to create then.
            if (string.IsNullOrEmpty(PathToDir))
                return;

            if (!Directory.Exists(PathToDir))
                Directory.CreateDirectory(PathToDir);
        }

        /// <summary>
        /// Opens the workbook and returns its first sheet; ".xlsx" uses the XSSF reader, anything else the HSSF reader.
        /// </summary>
        private NPOI.SS.UserModel.ISheet GetParseXls()
        {
            if (HasExtension(".xlsx"))
            {
                NPOI.XSSF.UserModel.XSSFWorkbook workbook;
                using (var file = new FileStream(PathToFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    workbook = new NPOI.XSSF.UserModel.XSSFWorkbook(file);
                }
                return workbook.GetSheetAt(0);
            }
            else
            {
                NPOI.HSSF.UserModel.HSSFWorkbook workbook;
                using (var file = new FileStream(PathToFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    workbook = new NPOI.HSSF.UserModel.HSSFWorkbook(file);
                }
                return workbook.GetSheetAt(0);
            }
        }
    }

    #endregion

    #region CreateGrabCatalog

    public class CreateGrabCatalog : CustomScript
    {
        private readonly Category _mroot;
        private readonly HtmlPageLoader _hplCatalog;
        private readonly GrabCatalogBuildScriptParameters _p;
        private readonly string _domen;

        /// <summary>
        /// Creates a catalog builder bound to a root category.
        /// </summary>
        /// <param name="mroot">Root category; also passed to the base constructor.</param>
        /// <param name="hplCatalog">Page loader used to load and parse catalog pages.</param>
        /// <param name="p">Script parameters; used for logging.</param>
        /// <param name="domen">Site root used to make relative links absolute.</param>
        public CreateGrabCatalog(Category mroot, HtmlPageLoader hplCatalog, GrabCatalogBuildScriptParameters p, string domen)
            : base(mroot)
        {
            _mroot = mroot;
            _hplCatalog = hplCatalog;
            _p = p;
            _domen = domen;
        }

        /// <summary>
        /// Adds every category of the list to the grab catalog under the same parent.
        /// </summary>
        /// <param name="lCat">Categories to add.</param>
        /// <param name="id">ID of the parent category.</param>
        public void AddCategories(List<Category> lCat, string id)
        {
            lCat.ForEach(category => AddCategory(category, id));
        }

        /// <summary>
        /// Adds one category to the grab catalog under the parent with the given ID.
        /// </summary>
        /// <param name="c">Category to add; its SourceUrl is made absolute and its ID is set to the ID of the created category.</param>
        /// <param name="id">ID of the parent category.</param>
        public void AddCategory(Category c, string id)
        {
            var root = _mroot.FindCategoryById(id);

            // A missing link would fail inside TrueLink; treat it like the "#" placeholder.
            if (c.SourceUrl == null || c.SourceUrl == "#")
            {
                _p.Process.m_ti.AddLogError("The Category didn't add : url category = " + c.SourceUrl );
                return;
            }

            // NOTE(review): presumably FindCategoryById yields null for an unknown ID - confirm;
            // the guard keeps one bad ID from aborting the whole catalog build.
            if (root == null)
            {
                _p.Process.m_ti.AddLogError("The Category didn't add : parent category id = " + id);
                return;
            }

            c.SourceUrl = c.SourceUrl.TrueLink(_domen);

            var cat = root.AddCategory(c.Name, true);
            cat.SourceUrl = c.SourceUrl;
            c.ID = cat.ID;
            _p.Process.m_ti.AddLogInfo("Added category level " + cat.Level + " '" + cat.GetFullName() + "', url " + cat.SourceUrl);
        }

        /// <summary>
        /// Collects categories from a page: names come from one XPath, links from another.
        /// </summary>
        /// <param name="xpathName">Name expression in the "xpath[attr]attr" format.</param>
        /// <param name="xpathLink">Link expression in the "xpath[attr]attr" format.</param>
        /// <param name="root">Parent category; its SourceUrl is loaded when no HTML is supplied.</param>
        /// <param name="htmlPage">Ready HTML to parse instead of loading the parent's page.</param>
        /// <returns>
        /// One category per name/link pair with tags stripped from the name; an empty list when
        /// no links were found or the numbers of names and links differ.
        /// </returns>
        public List<Category> CreateListCategory(string xpathName, string xpathLink, Category root, string htmlPage = null)
        {
            if (htmlPage != null)
                _hplCatalog.SetContent(htmlPage);
            else
                _hplCatalog.Load(root.SourceUrl);

            var categories = new List<Category>();

            var names = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
            var links = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);

            if (links.Count <= 0 || names.Count != links.Count)
                return categories;

            for (var i = 0; i < names.Count; i++)
            {
                var cleanName = Regex.Replace(names[i], @"<[^>]*>", "").Trim();
                categories.Add(new Category {Name = cleanName, SourceUrl = links[i]});
            }

            return categories;
        }

        /// <summary>
        /// Collects categories from a single page together with the HTML block that belongs to each of them.
        /// </summary>
        /// <param name="html">Page HTML to parse.</param>
        /// <param name="xpathBlock">Expression selecting the HTML block of each category.</param>
        /// <param name="xpathName">Expression selecting the category names.</param>
        /// <param name="xpathLink">Expression selecting the category links.</param>
        /// <param name="root">Not used by this overload.</param>
        /// <returns>
        /// Category to block HTML. The link falls back to the site root when the numbers of links and
        /// names differ; the block is an empty string when there is none at the category's position.
        /// </returns>
        public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlock, string xpathName, string xpathLink, Category root)
        {
            _hplCatalog.SetContent(html);
            var lc = new Dictionary<Category, string>();

            var name = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathName);
            var link = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathLink);
            var blockHtml = Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlock);

            if (name.Count <= 0) return lc;
            for (var a = 0; a < name.Count; a++)
            {
                var cat = new Category {Name = name[a].Trim(), SourceUrl = (link.Count == name.Count) ? link[a] : _domen};
                // Fewer blocks than names used to raise an out-of-range error; check the position itself.
                lc.Add(cat, (a < blockHtml.Count) ? blockHtml[a] : string.Empty);
            }
            return lc;
        }
    
        /// <summary>
        /// Builds categories from a single page by first cutting it into fragments and then
        /// reading name, link and block value from each fragment separately.
        /// </summary>
        /// <param name="html">Page markup that is loaded into the catalog page loader.</param>
        /// <param name="xpathBlockAll">XPath selecting the fragments, one per category.</param>
        /// <param name="xpathBlock">XPath, applied inside a fragment, whose first value becomes the dictionary value.</param>
        /// <param name="xpathName">XPath, applied inside a fragment, selecting the category name.</param>
        /// <param name="xpathLink">XPath, applied inside a fragment, selecting the category link.</param>
        /// <param name="root">Not used by this method.</param>
        /// <returns>
        /// A dictionary keyed by the created categories; fragments without a name are skipped.
        /// </returns>
        public Dictionary<Category, string> CreateDictionaryCategoryOnePage(string html, string xpathBlockAll, string xpathBlock, string xpathName, string xpathLink, Category root)
        {
            var result = new Dictionary<Category, string>();

            _hplCatalog.SetContent(html);
            foreach (var fragment in Helper.ExtractValuesByXpath(_hplCatalog.HtmlDoc, xpathBlockAll))
            {
                // The shared loader is re-pointed at the fragment so the XPaths below are evaluated against it.
                _hplCatalog.SetContent(fragment);

                var categoryName = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathName);
                var categoryLink = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathLink);
                var blockValue = Helper.ExtractFirstValueByXpath(_hplCatalog.HtmlDoc, xpathBlock);

                if (!string.IsNullOrEmpty(categoryName))
                {
                    // A missing link falls back to the site domain.
                    var url = string.IsNullOrEmpty(categoryLink) ? _domen : categoryLink;
                    var category = new Category { Name = categoryName, SourceUrl = url };
                    result.Add(category, blockValue ?? string.Empty);
                }
            }

            return result;
        }
    }

    #endregion

    #region HtmlData
    /// <summary>
    /// Snapshot of a single HTML node: its decoded text and inner HTML, a handful of
    /// well-known attributes exposed as properties, and every attribute in <see cref="Attr2Value"/>.
    /// </summary>
    public class HtmlData
    {
        public string Text { get; set; }
        public string Value { get; set; }
        public string Title { get; set; }
        public string Alt { get; set; }
        public string Style { get; set; }
        public string Name { get; set; }
        public string Html { get; set; }
        public string Href { get; set; }
        public string Id { get; set; }
        public string Src { get; set; }
        public string Class { get; set; }
        public string OnClick { get; set; }

        /// <summary>
        /// All attributes of the node, keyed by attribute name. Values are stored as found
        /// on the node (not HTML-decoded, unlike the named properties).
        /// </summary>
        public Dictionary<string, string> Attr2Value { get; set; }

        /// <summary>
        /// Creates an empty instance: every string property is an empty string and the attribute map is empty.
        /// </summary>
        public HtmlData()
        {
            Attr2Value = new Dictionary<string, string>();
            Text = Value = Style = Name = Html = Href = Id = Src = Class = OnClick = Title = Alt = string.Empty;
        }

        /// <summary>
        /// Reads a number from <see cref="Text"/> or from the named attribute.
        /// </summary>
        /// <param name="str">Attribute name to read; when null or empty, <see cref="Text"/> is used.</param>
        /// <returns>The parsed number, or 0 when no parsable number is found.</returns>
        /// <exception cref="KeyNotFoundException">The requested attribute does not exist on this node.</exception>
        public double ToDouble(string str = "")
        {
            str = GetRawValue(str);

            // Drop all whitespace, then take the first run of digits that may contain commas.
            str = Regex.Replace(str, @"\r|\t|\n|\s", "").Trim();
            str = Regex.Match(str, @"(\d+,*\d*)").Groups[1].Value.Trim(',');

            // NOTE(review): parsing uses the current culture, so the meaning of ',' depends on the
            // machine settings, and a '.' fraction is never captured by the pattern above - confirm intent.
            double res;
            return double.TryParse(str, out res) ? res : 0;
        }

        /// <summary>
        /// Reads an integer from <see cref="Text"/> or from the named attribute.
        /// </summary>
        /// <param name="str">Attribute name to read; when null or empty, <see cref="Text"/> is used.</param>
        /// <returns>The first run of digits parsed as an integer, or 0 when it cannot be parsed.</returns>
        /// <exception cref="KeyNotFoundException">The requested attribute does not exist on this node.</exception>
        public int ToInt32(string str = "")
        {
            str = GetRawValue(str);

            // Commas become dots so that only the digits in front of the first separator are matched.
            str = Regex.Replace(str, @"\r|\t|\n|\s", "").Replace(",", ".").Trim();
            str = Regex.Match(str, @"(\d+)").Groups[1].Value.Trim(',');

            int res;
            return int.TryParse(str, out res) ? res : 0;
        }

        /// <summary>
        /// Returns <see cref="Text"/> when no attribute name is given, otherwise the stored attribute value.
        /// </summary>
        /// <exception cref="KeyNotFoundException">The requested attribute does not exist on this node.</exception>
        private string GetRawValue(string attributeName)
        {
            if (string.IsNullOrEmpty(attributeName))
                return Text;

            string attributeValue;
            if (!Attr2Value.TryGetValue(attributeName, out attributeValue))
            {
                // The original code built this exception but never threw it; the dictionary indexer
                // then raised KeyNotFoundException anyway, so the exception type is unchanged.
                throw new KeyNotFoundException("This attribute is not found");
            }
            return attributeValue;
        }

        /// <summary>
        /// Extract the HtmlData object of the first node matched by the given XPath.
        /// </summary>
        /// <param name="hpl">Loader holding the parsed document.</param>
        /// <param name="xpath">
        /// Plain XPath (not the [attr] form). Each item produced by Helper.SplitToNextMark is tried in order
        /// until one of them selects a node.
        /// </param>
        /// <returns>
        /// Data of the first node found; an empty <see cref="HtmlData"/> when nothing matched;
        /// null when the XPath splits into no items at all.
        /// </returns>
        public static HtmlData ExtractFirstHtmlData(HtmlPageLoader hpl, string xpath)
        {
            var hadCandidates = false;
            foreach (var xp in Helper.SplitToNextMark(xpath))
            {
                hadCandidates = true;

                // Test the node itself: GetHtmlData never returns null, so filtering on its result
                // (as the previous LINQ chain did) always accepted the first item even on a miss.
                var node = hpl.HtmlDoc.DocumentNode.SelectSingleNode(xp);
                if (node != null)
                    return GetHtmlData(node);
            }

            // Same results as before when nothing matched: empty object, or null without any candidates.
            return hadCandidates ? new HtmlData() : null;
        }

        /// <summary>
        /// Extract the HtmlData objects of all nodes matched by the given XPath.
        /// </summary>
        /// <param name="hpl">Loader holding the parsed document.</param>
        /// <param name="xpath">
        /// Plain XPath (not the [attr] form). Each item produced by Helper.SplitToNextMark is tried in order;
        /// the first one that selects any node supplies the whole result.
        /// </param>
        /// <returns>The matched nodes as HtmlData objects, or an empty list when nothing matched.</returns>
        public static List<HtmlData> ExtractListHtmlData(HtmlPageLoader hpl, string xpath)
        {
            var res = new List<HtmlData>();
            foreach (var xp in Helper.SplitToNextMark(xpath))
            {
                var htmlNodes = hpl.HtmlDoc.DocumentNode.SelectNodes(xp);

                if (htmlNodes == null) continue;

                res.AddRange(htmlNodes.Select(GetHtmlData).Where(htmlData => htmlData != null));
                if (res.Count > 0)
                    break;
            }
            return res;
        }

        /// <summary>
        /// Converts a node into an HtmlData object; a null node yields an empty object.
        /// </summary>
        private static HtmlData GetHtmlData(HtmlNode htmlNode)
        {
            if (htmlNode == null)
                return new HtmlData();

            var hd = new HtmlData
            {
                Text = HttpUtility.HtmlDecode(htmlNode.InnerText),
                Html = HttpUtility.HtmlDecode(htmlNode.InnerHtml)
            };

            foreach (var pair in htmlNode.Attributes)
            {
                // Indexer instead of Add: a repeated attribute name on the node must not abort
                // the extraction. The last occurrence wins, exactly as in the switch below.
                hd.Attr2Value[pair.Name] = pair.Value;

                var value = HttpUtility.HtmlDecode(pair.Value);
                switch (pair.Name)
                {
                    case "style":
                        hd.Style = value;
                        break;
                    case "name":
                        hd.Name = value;
                        break;
                    case "value":
                        hd.Value = value;
                        break;
                    case "href":
                        hd.Href = value;
                        break;
                    case "src":
                        hd.Src = value;
                        break;
                    case "id":
                        hd.Id = value;
                        break;
                    case "class":
                        hd.Class = value;
                        break;
                    case "onclick":
                        hd.OnClick = value;
                        break;
                    case "title":
                        hd.Title = value;
                        break;
                    case "alt":
                        hd.Alt = value;
                        break;
                }
            }
            return hd;
        }

    }
    #endregion
}

Leave a Reply

Your email address will not be published.