using CatalogLoader;
using CatalogLoaderCommon;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

namespace CatalogLoaderCore._MyWebScrapers
{
    /// <summary>
    /// Heuristic product-page scraper. It loads the product page HTML and fills product fields
    /// from several sources, in this order: the first <c>h1</c>, <c>meta property="..."</c> tags,
    /// <c>itemprop</c> attributes, attribute-name heuristics, two-cell table rows, list items,
    /// and <c>img itemprop="image"</c> elements.
    /// </summary>
    public class AIProductInfoScraper : CustomScriptBase
    {
        /// <summary>Upper bound (exclusive) on the text length accepted for identifier-like fields.</summary>
        private const int MaxIdentifierLength = 16;

        /// <summary>Upper bound (exclusive) on the length of a dynamic attribute name or value.</summary>
        private const int MaxFeatureTextLength = 50;

        /// <summary>Separator characters stripped from both ends of a breadcrumb item.</summary>
        private static readonly char[] BreadcrumbSeparators = { '>', '-', '|', '/', '\\' };

        HtmlAgilityPack.HtmlDocument _htmlDoc = null;
        RunProductScriptParameters _p = null;

        // Per-run cache of node -> plain text, so the same node is not converted repeatedly.
        Dictionary<object, string> _object2txt = new Dictionary<object, string>();

        // Name and value of the attribute currently inspected by General();
        // read by AttributeCurrentContains().
        string _attributeName_Current = "";
        string _attributeValue_Current = "";

        /// <summary>
        /// Loads the page at <c>p.Product.Url</c> and runs every extraction pass over it.
        /// </summary>
        /// <param name="p">Script parameters; <c>p.Process</c> must be a <c>OneProductLoader</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// <c>p.Process</c> is not a <c>OneProductLoader</c>, so the page cannot be loaded.
        /// </exception>
        public override void RunProduct(RunProductScriptParameters p)
        {
            base.RunProduct(p);
            _p = p;
            _object2txt.Clear();

            // Fail with a clear message instead of a NullReferenceException on the first use.
            if (!(p.Process is OneProductLoader opl))
            {
                throw new InvalidOperationException(
                    "AIProductInfoScraper requires p.Process to be a OneProductLoader.");
            }

            opl.m_ti.AddLogInfo("it is log from AIProductInfoScraper!");
            opl.State.CacheGet().LoadPageFromCacheOrWeb(p.Product.Url, out string html, out string _);

            _htmlDoc = new HtmlAgilityPack.HtmlDocument();
            _htmlDoc.LoadHtml(html);

            var heading = _htmlDoc.DocumentNode.SelectSingleNode("//h1");
            if (heading != null)
            {
                p.Product.Name = heading.InnerText;
            }

            // Order matters: Meta() and Itemprop() overwrite unconditionally,
            // General() only fills fields that are still blank.
            Meta();
            Itemprop();
            General();
            FeaturesTable();
            FeaturesUl();
            Images();

            // Sample page used during development:
            // https://www.dalauta.com/lt/ratlankiai/127468-jr-wheels-jr18-17x8-et35-blank-hyper-gray-5902211951544.html
        }

        /// <summary>
        /// Reads <c>meta</c> tags that carry a <c>property</c> attribute (og:title, og:image,
        /// og:description, product:price:amount, product:price:currency) and copies their
        /// non-blank <c>content</c> into the product.
        /// </summary>
        private void Meta()
        {
            var metaNodes = _htmlDoc.DocumentNode.SelectNodes("//meta");
            if (metaNodes == null)
            {
                return;
            }

            foreach (var meta in metaNodes)
            {
                var property = meta.Attributes["property"];
                if (property == null)
                {
                    continue;
                }

                var content = meta.Attributes["content"]?.Value;
                if (string.IsNullOrWhiteSpace(content))
                {
                    continue;
                }

                // Invariant lowercasing: these are protocol identifiers, not linguistic text.
                // A culture-sensitive ToLower() breaks "og:title" etc. under Turkish casing rules.
                switch ((property.Value ?? "").ToLowerInvariant())
                {
                    case "og:title":
                        _p.Product.Name = content;
                        break;
                    case "og:image":
                        _p.Product.ImageAdd(content);
                        break;
                    case "og:description":
                        _p.Product.FullDescription = content;
                        break;
                    case "product:price:amount":
                        _p.Product.Price = content;
                        break;
                    case "product:price:currency":
                        _p.Product.Currency = content;
                        break;
                }
            }
        }

        /// <summary>
        /// Scans every node that has an <c>itemprop</c> attribute and maps sku/code, price,
        /// currency, availability, brand and mpn onto the product. The value is taken from the
        /// <c>content</c> attribute; for availability it is taken from <c>href</c>, falling back
        /// to <c>content</c> when there is no <c>href</c>.
        /// </summary>
        private void Itemprop()
        {
            foreach (var node in _htmlDoc.DocumentNode.Descendants())
            {
                var itemprop = node.Attributes["itemprop"];
                if (itemprop == null || string.IsNullOrWhiteSpace(itemprop.Value))
                {
                    continue;
                }

                // Normalise once, up front, so the attribute selection below and the field
                // mapping further down agree on casing.
                var name = itemprop.Value.ToLowerInvariant();
                var isAvailability = name.Contains("availability");

                var valueAttribute = isAvailability
                    ? (node.Attributes["href"] ?? node.Attributes["content"])
                    : node.Attributes["content"];
                if (valueAttribute == null)
                {
                    continue;
                }

                var value = valueAttribute.Value;
                if (string.IsNullOrWhiteSpace(value))
                {
                    continue;
                }

                // Independent checks on purpose: a single itemprop name may satisfy several of them.
                if (name == "sku" || name == "code")
                {
                    _p.Product.Art = value;
                }

                if (name == "price")
                {
                    _p.Product.Price = value;
                }

                if (name.Contains("currency"))
                {
                    _p.Product.Currency = value;
                }

                if (isAvailability)
                {
                    var inStock = value.IndexOf("instock", StringComparison.OrdinalIgnoreCase) >= 0;
                    _p.Product.Quantity = inStock ? "1" : "0";
                }

                if (name.Contains("brand"))
                {
                    _p.Product.Manufacturer = value;
                }

                if (name.Contains("mpn"))
                {
                    _p.Product.MNP = value;
                }
            }
        }

        /// <summary>
        /// Fallback pass: walks every attribute of every node and, when the attribute name or
        /// value contains a known keyword, fills the matching product field from the node's plain
        /// text. Only fields that are still blank are filled. The first node whose attribute
        /// mentions "bread" supplies the category path.
        /// </summary>
        private void General()
        {
            var categories = new List<string>();
            var breadFound = false;

            foreach (var node in _htmlDoc.DocumentNode.Descendants())
            {
                foreach (var attribute in node.Attributes)
                {
                    // No lowercasing needed: AttributeCurrentContains() compares case-insensitively.
                    _attributeName_Current = attribute.Name ?? "";
                    _attributeValue_Current = attribute.Value ?? "";

                    if (!breadFound && AttributeCurrentContains("bread"))
                    {
                        breadFound = true;
                        CollectBreadcrumbs(node, categories);
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Price) && AttributeCurrentContains("price"))
                    {
                        if (AttributableItem.DoubleGetFromString(txt(node), out double price) && price > 0)
                        {
                            // Invariant formatting keeps the stored price independent of the
                            // machine's regional settings (no "12,5" on comma-decimal cultures).
                            _p.Product.Price = price.ToString(CultureInfo.InvariantCulture);
                        }
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.FullDescription) && AttributeCurrentContains("description"))
                    {
                        _p.Product.FullDescription = txt(node);
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Art) && AttributeCurrentContains("sku")
                        && TryGetShortText(node, 4, out string sku))
                    {
                        _p.Product.Art = sku;
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Code) && AttributeCurrentContains("code")
                        && TryGetShortText(node, 4, out string code))
                    {
                        _p.Product.Code = code;
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.MNP) && AttributeCurrentContains("MPN")
                        && TryGetShortText(node, -1, out string mpn))
                    {
                        _p.Product.MNP = mpn;
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.BarCode)
                        && (AttributeCurrentContains("barcode") || AttributeCurrentContains("upc"))
                        && TryGetShortText(node, -1, out string barCode))
                    {
                        _p.Product.BarCode = barCode;
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.IdShopDb) && AttributeCurrentContains("asin")
                        && TryGetShortText(node, -1, out string asin))
                    {
                        _p.Product.IdShopDb = asin;
                    }

                    // Not wired up yet (no target product field was chosen):
                    //if (string.IsNullOrWhiteSpace(_p.Product.ID))
                    //{
                    //    if (AttributeCurrentContains("productid") || AttributeCurrentContains("product-id"))
                    //    {
                    //        var sku = txt(n);
                    //        if (sku != null && sku.Length < 16)
                    //        {
                    //            _p.Product.ad = sku;
                    //        }
                    //    }
                    //}

                    if (string.IsNullOrWhiteSpace(_p.Product.Quantity)
                        && (AttributeCurrentContains("avail") || AttributeCurrentContains("quantity") || AttributeCurrentContains("qty")))
                    {
                        _p.Product.Quantity = txt(node);
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Currency) && AttributeCurrentContains("currency"))
                    {
                        _p.Product.Currency = txt(node);
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Manufacturer) && AttributeCurrentContains("manufacturer"))
                    {
                        _p.Product.Manufacturer = txt(node);
                    }
                }
            }

            if (categories.Count > 0)
            {
                _p.Product.CategoryFullPathSaveTo = string.Join("->", categories);
            }
        }

        /// <summary>
        /// Appends the cleaned plain text of every childless descendant of
        /// <paramref name="breadcrumbRoot"/> to <paramref name="categories"/>, skipping blanks.
        /// </summary>
        private void CollectBreadcrumbs(HtmlAgilityPack.HtmlNode breadcrumbRoot, List<string> categories)
        {
            foreach (var leaf in breadcrumbRoot.Descendants())
            {
                if (leaf.ChildNodes.Count != 0)
                {
                    continue;
                }

                var item = TrimBreadcrumbSeparators(txt(leaf));
                if (!string.IsNullOrWhiteSpace(item))
                {
                    categories.Add(item);
                }
            }
        }

        /// <summary>
        /// Strips whitespace and breadcrumb separator characters from both ends of
        /// <paramref name="text"/>, repeating until nothing more can be removed.
        /// </summary>
        /// <returns>The trimmed text; an empty string when <paramref name="text"/> is null.</returns>
        private static string TrimBreadcrumbSeparators(string text)
        {
            var current = (text ?? "").Trim();
            string previous;
            do
            {
                previous = current;
                current = current.Trim(BreadcrumbSeparators).Trim();
            }
            while (current != previous);

            return current;
        }

        /// <summary>
        /// Gets the plain text of <paramref name="n"/> and reports whether it is usable as an
        /// identifier: not null, longer than <paramref name="minLengthExclusive"/> and shorter
        /// than <see cref="MaxIdentifierLength"/>.
        /// </summary>
        private bool TryGetShortText(HtmlAgilityPack.HtmlNode n, int minLengthExclusive, out string text)
        {
            text = txt(n);
            return text != null
                && text.Length > minLengthExclusive
                && text.Length < MaxIdentifierLength;
        }

        /// <summary>
        /// Adds every <c>img itemprop="image"</c> source to the product. Sources that do not start
        /// with "http" are resolved against the product URL; sources that cannot be resolved are
        /// skipped instead of aborting the whole scrape.
        /// </summary>
        private void Images()
        {
            var imageNodes = _htmlDoc.DocumentNode.SelectNodes("//img[@itemprop='image']");
            if (imageNodes == null)
            {
                return;
            }

            // Parsed once; null when the product URL is not a valid absolute URI.
            Uri.TryCreate(_p.Product.Url, UriKind.Absolute, out Uri baseUri);

            foreach (var image in imageNodes)
            {
                var src = image.Attributes["src"]?.Value;
                if (string.IsNullOrWhiteSpace(src))
                {
                    continue;
                }

                if (!src.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    if (baseUri == null || !Uri.TryCreate(baseUri, src, out Uri resolved))
                    {
                        continue;
                    }

                    src = resolved.ToString();
                }

                _p.Product.ImageAdd(src);
            }
        }

        /// <summary>
        /// Treats every table row that has exactly two cells (<c>td</c> or <c>th</c>) as a
        /// name/value pair and stores it as a dynamic attribute.
        /// </summary>
        private void FeaturesTable()
        {
            var tables = _htmlDoc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                return;
            }

            foreach (var table in tables)
            {
                var rows = table.SelectNodes(".//tr");
                if (rows == null)
                {
                    continue;
                }

                foreach (var row in rows)
                {
                    var cells = row.SelectNodes(".//td|.//th");
                    if (cells == null || cells.Count != 2)
                    {
                        continue;
                    }

                    AddFeature(
                        TextUtils.GetPlainTextOnly(cells[0].InnerHtml),
                        TextUtils.GetPlainTextOnly(cells[1].InnerHtml));
                }
            }
        }

        /// <summary>
        /// Treats every list item with at least two child nodes as a name/value pair: the first
        /// two non-blank child texts become the name and the value of a dynamic attribute.
        /// </summary>
        private void FeaturesUl()
        {
            var lists = _htmlDoc.DocumentNode.SelectNodes("//ul");
            if (lists == null)
            {
                return;
            }

            foreach (var list in lists)
            {
                var items = list.SelectNodes(".//li");
                if (items == null)
                {
                    continue;
                }

                foreach (var item in items)
                {
                    if (item.ChildNodes.Count < 2)
                    {
                        continue;
                    }

                    var parts = new List<string>(2);
                    foreach (var child in item.ChildNodes)
                    {
                        var text = TextUtils.GetPlainTextOnly(child.InnerHtml);
                        if (string.IsNullOrWhiteSpace(text))
                        {
                            continue;
                        }

                        parts.Add(text);
                        if (parts.Count == 2)
                        {
                            // Only the first two non-blank texts are ever used.
                            break;
                        }
                    }

                    if (parts.Count < 2)
                    {
                        continue;
                    }

                    AddFeature(parts[0], parts[1]);
                }
            }
        }

        /// <summary>
        /// Stores a name/value pair as a dynamic attribute when both parts are non-blank and
        /// shorter than <see cref="MaxFeatureTextLength"/> characters.
        /// </summary>
        private void AddFeature(string name, string value)
        {
            if (string.IsNullOrWhiteSpace(name) || string.IsNullOrWhiteSpace(value))
            {
                return;
            }

            if (name.Length < MaxFeatureTextLength && value.Length < MaxFeatureTextLength)
            {
                _p.Product.AddDynamicAttribute(name, value);
            }
        }

        /// <summary>
        /// Reports whether the name or the value of the attribute currently inspected by
        /// <see cref="General"/> contains <paramref name="substring"/>, ignoring case.
        /// </summary>
        bool AttributeCurrentContains(string substring)
        {
            return _attributeName_Current.IndexOf(substring, StringComparison.OrdinalIgnoreCase) >= 0
                || _attributeValue_Current.IndexOf(substring, StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Returns the plain text of <paramref name="n"/> (via <c>TextUtils.GetPlainTextOnly</c>
        /// on its inner HTML), cached per node for the duration of the current run.
        /// </summary>
        string txt(HtmlAgilityPack.HtmlNode n)
        {
            if (_object2txt.TryGetValue(n, out string cached))
            {
                return cached;
            }

            var plain = TextUtils.GetPlainTextOnly(n.InnerHtml);
            _object2txt[n] = plain;
            return plain;
        }

        /// <summary>
        /// Placeholder for extracting an embedded JSON object from the page; not implemented and
        /// not called. The disabled draft below is kept as a starting point.
        /// </summary>
        void JsonFind(string htmlText)
        {
            //string htmlText = Resources.html;
            //string jsonPtn = @"\{(?:[^\{\}]|(?<o>\{)|(?<-o>\}))+(?(o)(?!))\}";
            //string input = htmlText.Substring(htmlText.IndexOf("redirectResponse="));
            //Match match = Regex.Matches(input, jsonPtn, RegexOptions.Multiline | RegexOptions.IgnoreCase)[0];
            //string jsonText = match.Groups[0].Value;
            //var jsonObj = JObject.Parse(jsonText);
        }
    }
}