MyDataProvider » Blog » Universal ecommerce products info extractor – code sample

Universal ecommerce products info extractor – code sample

  • by

using CatalogLoader;
using CatalogLoaderCommon;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace CatalogLoaderCore._MyWebScrapers
{
    public class AIProductInfoScraper : CustomScriptBase
    {
        HtmlAgilityPack.HtmlDocument _htmlDoc = null;
        RunProductScriptParameters _p = null;
        Dictionary<object, string> _object2txt = new Dictionary<object, string>();
        string _attributeName_Current = "";
        string _attributeValue_Current = "";

        /// <summary>
        /// Scrapes one product page: loads its HTML through the loader's cache, seeds the product
        /// name from the first &lt;h1&gt;, then runs the extraction passes in a fixed order
        /// (Meta, Itemprop, General, FeaturesTable, FeaturesUl, Images).
        /// </summary>
        /// <param name="p">Script parameters; <c>p.Process</c> must be a <c>OneProductLoader</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <c>p.Process</c> is not a <c>OneProductLoader</c>.
        /// </exception>
        public override void RunProduct(RunProductScriptParameters p)
        {
            base.RunProduct(p);
            _p = p;

            // The text cache is keyed by nodes of the previously parsed document.
            _object2txt.Clear();

            var opl = p.Process as OneProductLoader;
            if (opl == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException on the next line.
                throw new InvalidOperationException(
                    "AIProductInfoScraper requires p.Process to be a OneProductLoader.");
            }
            opl.m_ti.AddLogInfo("it is log from  AIProductInfoScraper!");

            opl.State.CacheGet().LoadPageFromCacheOrWeb(p.Product.Url, out string html, out string _);

            // HtmlDocument.LoadHtml rejects null; an empty document lets every pass run as a no-op.
            _htmlDoc = new HtmlAgilityPack.HtmlDocument();
            _htmlDoc.LoadHtml(html ?? string.Empty);

            var nH1 = _htmlDoc.DocumentNode.SelectSingleNode("//h1");
            if (nH1 != null)
            {
                // InnerText is raw markup text: decode entities and drop surrounding whitespace.
                var heading = HtmlAgilityPack.HtmlEntity.DeEntitize(nH1.InnerText);
                if (!string.IsNullOrWhiteSpace(heading))
                {
                    p.Product.Name = heading.Trim();
                }
            }

            Meta();
            Itemprop();
            General();
            FeaturesTable();
            FeaturesUl();
            Images();

            //https://www.dalauta.com/lt/ratlankiai/127468-jr-wheels-jr18-17x8-et35-blank-hyper-gray-5902211951544.html
        }
        /// <summary>
        /// Copies values from <c>&lt;meta property="…" content="…"&gt;</c> tags onto the current product:
        /// og:title → Name, og:image → ImageAdd, og:description → FullDescription,
        /// product:price:amount → Price, product:price:currency → Currency.
        /// Tags without a <c>property</c> attribute or with blank content are ignored.
        /// </summary>
        private void Meta()
        {
            var metaNodes = _htmlDoc.DocumentNode.SelectNodes("//meta");
            if (metaNodes == null)
            {
                return;
            }

            foreach (var meta in metaNodes)
            {
                var propertyAttr = meta.Attributes["property"];
                var contentAttr = meta.Attributes["content"];
                if (propertyAttr == null || propertyAttr.Value == null || contentAttr == null)
                {
                    continue;
                }

                // Attribute values come back as written in the markup; decode &amp;, &quot; etc.
                var content = HtmlAgilityPack.HtmlEntity.DeEntitize(contentAttr.Value);
                if (string.IsNullOrWhiteSpace(content))
                {
                    continue;
                }

                // Property names are protocol identifiers, so compare culture-independently
                // (ToLower() under e.g. a Turkish culture would turn "OG:TITLE" into "og:tıtle").
                switch (propertyAttr.Value.Trim().ToLowerInvariant())
                {
                    case "og:title":
                        _p.Product.Name = content;
                        break;
                    case "og:image":
                        _p.Product.ImageAdd(content);
                        break;
                    case "og:description":
                        _p.Product.FullDescription = content;
                        break;
                    case "product:price:amount":
                        _p.Product.Price = content;
                        break;
                    case "product:price:currency":
                        _p.Product.Currency = content;
                        break;
                }
            }
        }
        /// <summary>
        /// Reads schema.org-style microdata: every node carrying an <c>itemprop</c> attribute whose
        /// value is exposed through <c>content</c> (or, for availability, <c>href</c>) is mapped onto
        /// the current product (Art, Price, Currency, Quantity, Manufacturer, MNP).
        /// </summary>
        private void Itemprop()
        {
            foreach (var node in _htmlDoc.DocumentNode.Descendants())
            {
                var itempropAttr = node.Attributes["itemprop"];
                if (itempropAttr == null || string.IsNullOrWhiteSpace(itempropAttr.Value))
                {
                    continue;
                }

                // Normalise once, before any comparison, so every check below sees the same casing.
                var name = itempropAttr.Value.ToLowerInvariant();
                var isAvailability = name.Contains("availability");

                // Availability is published either as <link href="…/InStock"> or as
                // <meta content="…/InStock">; prefer href and fall back to content.
                var valueAttr = node.Attributes["content"];
                if (isAvailability)
                {
                    valueAttr = node.Attributes["href"] ?? valueAttr;
                }
                if (valueAttr == null)
                {
                    continue;
                }

                // Decode HTML entities (e.g. "Black &amp; Decker"); DeEntitize passes null through.
                var value = HtmlAgilityPack.HtmlEntity.DeEntitize(valueAttr.Value);
                if (string.IsNullOrWhiteSpace(value))
                {
                    continue;
                }

                if (name == "sku" || name == "code")
                {
                    _p.Product.Art = value;
                }
                if (name == "price")
                {
                    _p.Product.Price = value;
                }
                if (name.Contains("currency"))
                {
                    _p.Product.Currency = value;
                }
                if (isAvailability)
                {
                    // Quantity is stored as "1" when the value mentions InStock, otherwise "0".
                    var inStock = value.IndexOf("instock", StringComparison.OrdinalIgnoreCase) >= 0;
                    _p.Product.Quantity = inStock ? "1" : "0";
                }
                if (name.Contains("brand"))
                {
                    _p.Product.Manufacturer = value;
                }
                if (name.Contains("mpn"))
                {
                    _p.Product.MNP = value;
                }
            }
        }
        /// <summary>
        /// Heuristic fallback pass. Walks every attribute of every node and, for each product field
        /// that is still blank, fills it from the text of the first node whose attribute name or
        /// value contains a matching keyword (price, description, sku, code, mpn, barcode/upc, asin,
        /// avail/quantity/qty, currency, manufacturer). Also derives the category path from the
        /// first "bread…" container that actually yields breadcrumb text.
        /// </summary>
        private void General()
        {
            var categories = new List<string>();
            var breadFound = false;

            foreach (var n in _htmlDoc.DocumentNode.Descendants())
            {
                foreach (var a in n.Attributes)
                {
                    // AttributeCurrentContains() reads these two fields.
                    _attributeName_Current = (a.Name ?? "").ToLowerInvariant();
                    _attributeValue_Current = (a.Value ?? "").ToLowerInvariant();

                    if (!breadFound && AttributeCurrentContains("bread"))
                    {
                        CollectBreadcrumbs(n, categories);
                        // Latch only once crumbs were really found: a <link href="breadcrumb.css">
                        // or an empty placeholder must not stop the search for the real trail.
                        breadFound = categories.Count > 0;
                    }

                    if (string.IsNullOrWhiteSpace(_p.Product.Price) && AttributeCurrentContains("price"))
                    {
                        if (AttributableItem.DoubleGetFromString(txt(n), out double price) && price > 0)
                        {
                            // Invariant formatting keeps the stored price independent of the
                            // machine's culture (no "12,5" on comma-decimal locales).
                            _p.Product.Price = price.ToString(System.Globalization.CultureInfo.InvariantCulture);
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.FullDescription) && AttributeCurrentContains("description"))
                    {
                        _p.Product.FullDescription = txt(n);
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.Art) && AttributeCurrentContains("sku"))
                    {
                        var sku = ShortText(n, 4);
                        if (sku != null)
                        {
                            _p.Product.Art = sku;
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.Code) && AttributeCurrentContains("code"))
                    {
                        var code = ShortText(n, 4);
                        if (code != null)
                        {
                            _p.Product.Code = code;
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.MNP) && AttributeCurrentContains("MPN"))
                    {
                        var mpn = ShortText(n, -1);
                        if (mpn != null)
                        {
                            _p.Product.MNP = mpn;
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.BarCode)
                        && (AttributeCurrentContains("barcode") || AttributeCurrentContains("upc")))
                    {
                        var barcode = ShortText(n, -1);
                        if (barcode != null)
                        {
                            _p.Product.BarCode = barcode;
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.IdShopDb) && AttributeCurrentContains("asin"))
                    {
                        var asin = ShortText(n, -1);
                        if (asin != null)
                        {
                            _p.Product.IdShopDb = asin;
                        }
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.Quantity)
                        && (AttributeCurrentContains("avail") || AttributeCurrentContains("quantity") || AttributeCurrentContains("qty")))
                    {
                        _p.Product.Quantity = txt(n);
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.Currency) && AttributeCurrentContains("currency"))
                    {
                        _p.Product.Currency = txt(n);
                    }
                    if (string.IsNullOrWhiteSpace(_p.Product.Manufacturer) && AttributeCurrentContains("manufacturer"))
                    {
                        _p.Product.Manufacturer = txt(n);
                    }
                }
            }

            if (categories.Count > 0)
            {
                _p.Product.CategoryFullPathSaveTo = String.Join("->", categories);
            }
        }

        /// <summary>
        /// Appends the cleaned text of every leaf node (no children) under <paramref name="container"/>
        /// to <paramref name="categories"/>, skipping leaves that are empty after cleaning.
        /// </summary>
        private void CollectBreadcrumbs(HtmlAgilityPack.HtmlNode container, List<string> categories)
        {
            foreach (var leaf in container.Descendants())
            {
                if (leaf.ChildNodes.Count != 0)
                {
                    continue;
                }
                var crumb = TrimBreadcrumbSeparators(txt(leaf));
                if (crumb.Length > 0)
                {
                    categories.Add(crumb);
                }
            }
        }

        /// <summary>
        /// Strips any leading/trailing run of whitespace and breadcrumb separator characters
        /// (&gt; - | / \ » › →) from <paramref name="text"/>; returns "" for null input.
        /// </summary>
        private static string TrimBreadcrumbSeparators(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return "";
            }

            const string separators = ">-|/\\\u00BB\u203A\u2192";
            var start = 0;
            var end = text.Length - 1;
            while (start <= end && (char.IsWhiteSpace(text[start]) || separators.IndexOf(text[start]) >= 0))
            {
                start++;
            }
            while (end >= start && (char.IsWhiteSpace(text[end]) || separators.IndexOf(text[end]) >= 0))
            {
                end--;
            }
            return text.Substring(start, end - start + 1);
        }

        /// <summary>
        /// Returns the node's plain text when its length is greater than
        /// <paramref name="minLengthExclusive"/> and less than 16; otherwise null.
        /// Used to reject long free text where a short identifier is expected.
        /// </summary>
        private string ShortText(HtmlAgilityPack.HtmlNode n, int minLengthExclusive)
        {
            var text = txt(n);
            if (text != null && text.Length > minLengthExclusive && text.Length < 16)
            {
                return text;
            }
            return null;
        }

        /// <summary>
        /// Adds every <c>&lt;img itemprop="image"&gt;</c> source to the product's images. Sources that do
        /// not start with "http" are resolved against the product URL; sources that cannot be
        /// resolved are skipped instead of aborting the scrape.
        /// </summary>
        private void Images()
        {
            var imgs = _htmlDoc.DocumentNode.SelectNodes("//img[@itemprop='image']");
            if (imgs == null)
            {
                return;
            }

            // Parse the page URL once; TryCreate avoids a UriFormatException on a malformed URL.
            Uri pageUri;
            var hasPageUri = Uri.TryCreate(_p.Product.Url, UriKind.Absolute, out pageUri);

            foreach (var img in imgs)
            {
                var src = img.Attributes["src"]?.Value;
                if (string.IsNullOrWhiteSpace(src))
                {
                    continue;
                }

                if (!src.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    Uri resolved;
                    if (!hasPageUri || !Uri.TryCreate(pageUri, src, out resolved))
                    {
                        // No usable base or a malformed relative reference: nothing to add.
                        continue;
                    }
                    // AbsoluteUri keeps percent-escapes; Uri.ToString() would unescape e.g. %20.
                    src = resolved.AbsoluteUri;
                }

                _p.Product.ImageAdd(src);
            }
        }

        /// <summary>
        /// Treats every table row that has exactly two cells (td/th, at any depth inside the row)
        /// as a "name / value" feature pair and adds it as a dynamic attribute when both texts are
        /// non-blank and shorter than 50 characters.
        /// </summary>
        private void FeaturesTable()
        {
            // One query over the whole document returns each row once, even when tables are
            // nested; iterating table-by-table visited inner rows once per enclosing table.
            var rows = _htmlDoc.DocumentNode.SelectNodes("//table//tr");
            if (rows == null)
            {
                return;
            }

            foreach (var row in rows)
            {
                var cells = row.SelectNodes(".//td|.//th");
                if (cells == null || cells.Count != 2)
                {
                    continue;
                }

                var fname = TextUtils.GetPlainTextOnly(cells[0].InnerHtml);
                var fval = TextUtils.GetPlainTextOnly(cells[1].InnerHtml);
                if (string.IsNullOrWhiteSpace(fname) || string.IsNullOrWhiteSpace(fval))
                {
                    continue;
                }

                // Long texts are unlikely to be a feature name/value pair.
                if (fname.Length < 50 && fval.Length < 50)
                {
                    _p.Product.AddDynamicAttribute(fname, fval);
                }
            }
        }

        /// <summary>
        /// Treats list items inside &lt;ul&gt; elements as potential "name / value" feature pairs:
        /// for each &lt;li&gt; with at least two child nodes, the first two children with non-blank
        /// plain text become the name and the value, provided both are shorter than 50 characters.
        /// </summary>
        private void FeaturesUl()
        {
            // One query over the whole document returns each item once, even when lists are
            // nested; iterating list-by-list visited inner items once per enclosing list.
            var items = _htmlDoc.DocumentNode.SelectNodes("//ul//li");
            if (items == null)
            {
                return;
            }

            foreach (var li in items)
            {
                if (li.ChildNodes.Count < 2)
                {
                    continue;
                }

                // Collect the non-blank text fragments of the item's direct children, in order.
                var parts = new List<string>();
                foreach (var child in li.ChildNodes)
                {
                    var text = TextUtils.GetPlainTextOnly(child.InnerHtml);
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        parts.Add(text);
                    }
                }
                if (parts.Count < 2)
                {
                    continue;
                }

                var fname = parts[0];
                var fval = parts[1];

                // Long texts are unlikely to be a feature name/value pair.
                if (fname.Length < 50 && fval.Length < 50)
                {
                    _p.Product.AddDynamicAttribute(fname, fval);
                }
            }
        }

        /// <summary>
        /// Tells whether <paramref name="substring"/> occurs, ignoring case (invariant culture),
        /// in the name or in the value of the attribute currently held in
        /// <c>_attributeName_Current</c> / <c>_attributeValue_Current</c>.
        /// </summary>
        bool AttributeCurrentContains(string substring)
        {
            const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;

            // The value is only searched when the name does not already match.
            var foundInName = _attributeName_Current.IndexOf(substring, ignoreCase) >= 0;
            return foundInName || _attributeValue_Current.IndexOf(substring, ignoreCase) >= 0;
        }

        /// <summary>
        /// Returns the plain text of <paramref name="n"/> (its inner HTML passed through
        /// <c>TextUtils.GetPlainTextOnly</c>), memoised per node in <c>_object2txt</c>.
        /// </summary>
        string txt(HtmlAgilityPack.HtmlNode n)
        {
            if (!_object2txt.TryGetValue(n, out string plain))
            {
                // First request for this node: compute once and remember the result.
                plain = TextUtils.GetPlainTextOnly(n.InnerHtml);
                _object2txt[n] = plain;
            }
            return plain;
        }

        /// <summary>
        /// Placeholder for extracting an embedded JSON object from page HTML.
        /// Currently a no-op: the body holds only a commented-out sketch and
        /// <paramref name="htmlText"/> is not read. Nothing in this class as shown calls it.
        /// </summary>
        /// <param name="htmlText">Page HTML the sketch below was meant to search.</param>
        void JsonFind(string htmlText)
        {
            // Disabled sketch: take the text starting at "redirectResponse=", match the first
            // brace-balanced {...} span with a balancing-group regex, and parse it with JObject.
            //string htmlText = Resources.html;
            //string jsonPtn = @"\{(?:[^\{\}]|(?<o>\{)|(?<-o>\}))+(?(o)(?!))\}";
            //string input = htmlText.Substring(htmlText.IndexOf("redirectResponse="));
            //Match match = Regex.Matches(input, jsonPtn, RegexOptions.Multiline | RegexOptions.IgnoreCase)[0];
            //string jsonText = match.Groups[0].Value;
            //var jsonObj = JObject.Parse(jsonText);
        }
    }
}