MyDataProvider » Blog » Universal ecommerce products info extractor – code sample

Universal ecommerce products info extractor – code sample

  • by

[code lang="csharp"]

using CatalogLoader;
using CatalogLoaderCommon;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace CatalogLoaderCore._MyWebScrapers
{
public class AIProductInfoScraper : CustomScriptBase
{
// Parsed HTML of the product page; assigned anew on every RunProduct call.
HtmlAgilityPack.HtmlDocument _htmlDoc = null;
// Parameters of the RunProduct call currently being processed (gives access to p.Product).
RunProductScriptParameters _p = null;
// Per-node cache of extracted plain text; filled by txt() and cleared at the start of RunProduct.
Dictionary<object, string> _object2txt = new Dictionary<object, string>();
// Lower-cased name of the attribute General() is currently examining; read by AttributeCurrentContains.
string _attributeName_Current = "";
// Lower-cased value of that same attribute ("" when the attribute has no value).
string _attributeValue_Current = "";

/// <summary>
/// Loads the product page for <c>p.Product.Url</c>, parses it, and runs every
/// heuristic extractor (meta tags, microdata, attribute-name guesses, feature
/// tables/lists, images) to fill in <c>p.Product</c>.
/// </summary>
/// <param name="p">Script parameters; <c>p.Process</c> must be a <c>OneProductLoader</c>.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>p.Process</c> is not a <c>OneProductLoader</c>.
/// </exception>
public override void RunProduct(RunProductScriptParameters p)
{
    base.RunProduct(p);
    _p = p;

    // Cached text belongs to nodes of the previously parsed document.
    _object2txt.Clear();

    // Fail with a clear message instead of a NullReferenceException on the next line.
    var opl = p.Process as OneProductLoader;
    if (opl == null)
    {
        throw new InvalidOperationException(
            "AIProductInfoScraper requires p.Process to be a OneProductLoader.");
    }
    opl.m_ti.AddLogInfo("it is log from AIProductInfoScraper!");

    // The redirect target is not used by this scraper.
    opl.State.CacheGet().LoadPageFromCacheOrWeb(p.Product.Url, out string html, out string _);

    //p.Product.FullDescription = html;

    // HtmlDocument.LoadHtml rejects null; an empty document simply yields no matches below.
    _htmlDoc = new HtmlAgilityPack.HtmlDocument();
    _htmlDoc.LoadHtml(html ?? string.Empty);

    // First <h1> is the initial guess for the name; Meta() may overwrite it with og:title.
    var nH1 = _htmlDoc.DocumentNode.SelectSingleNode("//h1");
    if (nH1 != null)
    {
        p.Product.Name = nH1.InnerText;
    }

    // Order matters: Meta and Itemprop fill fields first, and General only
    // fills the fields that are still blank afterwards.
    Meta();
    Itemprop();
    General();
    FeaturesTable();
    FeaturesUl();
    Images();

    //https://www.dalauta.com/lt/ratlankiai/127468-jr-wheels-jr18-17×8-et35-blank-hyper-gray-5902211951544.html

    //p.Product.Html = content;
}
/// <summary>
/// Reads Open Graph / product <c>&lt;meta property="…" content="…"&gt;</c> tags and
/// copies title, image, description, price and currency into the current product.
/// Tags without a <c>property</c> attribute or with blank <c>content</c> are ignored.
/// </summary>
private void Meta()
{
    var metaNodes = _htmlDoc.DocumentNode.SelectNodes("//meta");
    if (metaNodes == null)
    {
        return;
    }

    foreach (var meta in metaNodes)
    {
        var property = meta.Attributes["property"];
        if (property == null)
        {
            continue;
        }

        var contentAttribute = meta.Attributes["content"];
        var content = contentAttribute != null ? contentAttribute.Value : null;
        if (string.IsNullOrWhiteSpace(content))
        {
            continue;
        }

        // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a dotless 'ı'
        // under Turkish cultures, so "OG:IMAGE" would never match. The null-coalesce
        // keeps a value-less attribute from throwing.
        switch ((property.Value ?? "").ToLowerInvariant())
        {
            case "og:title":
                _p.Product.Name = content;
                break;
            case "og:image":
                _p.Product.ImageAdd(content);
                break;
            case "og:description":
                _p.Product.FullDescription = content;
                break;
            case "product:price:amount":
                _p.Product.Price = content;
                break;
            case "product:price:currency":
                _p.Product.Currency = content;
                break;
        }
    }
}
/// <summary>
/// Scans every node carrying an <c>itemprop</c> attribute (schema.org microdata) and
/// copies SKU, price, currency, availability, brand and MPN into the current product.
/// The value is taken from the <c>content</c> attribute; for availability the
/// <c>href</c> attribute is preferred and <c>content</c> is used as a fallback.
/// </summary>
private void Itemprop()
{
    foreach (var n in _htmlDoc.DocumentNode.Descendants())
    {
        var itempropAttr = n.Attributes["itemprop"];
        if (itempropAttr == null || string.IsNullOrWhiteSpace(itempropAttr.Value))
        {
            continue;
        }

        // Lower-case once, up front and culture-invariantly, so that every test below
        // (including the availability test that selects the source attribute) is
        // case-insensitive in the same way.
        var name = itempropAttr.Value.ToLowerInvariant();
        var isAvailability = name.Contains("availability");

        // Availability is commonly published as <link href="…/InStock">, but
        // <meta content="…/InStock"> is also used, so fall back to content.
        var href = isAvailability ? n.Attributes["href"] : null;
        var source = href != null && !string.IsNullOrWhiteSpace(href.Value)
            ? href
            : n.Attributes["content"];
        if (source == null || string.IsNullOrWhiteSpace(source.Value))
        {
            continue;
        }
        var value = source.Value;

        if (name == "sku" || name == "code")
        {
            _p.Product.Art = value;
        }
        if (name == "price")
        {
            _p.Product.Price = value;
        }
        if (name.Contains("currency"))
        {
            _p.Product.Currency = value;
        }
        if (isAvailability)
        {
            // Quantity is reduced to a flag: "1" when the value mentions InStock, else "0".
            var inStock = value.IndexOf("instock", StringComparison.OrdinalIgnoreCase) >= 0;
            _p.Product.Quantity = inStock ? "1" : "0";
        }
        if (name.Contains("brand"))
        {
            _p.Product.Manufacturer = value;
        }
        if (name.Contains("mpn"))
        {
            _p.Product.MNP = value;
        }
    }
}
/// <summary>
/// Last-resort heuristics: walks every attribute of every node and, when the attribute
/// name or value contains a telltale keyword ("price", "sku", "bread", …), uses the
/// node's plain text to fill whichever product field is still blank. The first node
/// whose attribute mentions "bread" is treated as the breadcrumb trail and its leaf
/// texts become the category path, joined with "->".
/// </summary>
private void General()
{
    var categories = new List<string>();
    var breadFound = false;

    foreach (var n in _htmlDoc.DocumentNode.Descendants())
    {
        foreach (var a in n.Attributes)
        {
            // Culture-invariant lower-casing keeps the keyword matching independent of
            // the machine's current culture (e.g. Turkish dotless 'ı').
            _attributeName_Current = a.Name.ToLowerInvariant();
            _attributeValue_Current = a.Value != null ? a.Value.ToLowerInvariant() : "";

            // Only the first breadcrumb container found is used.
            if (!breadFound && AttributeCurrentContains("bread"))
            {
                breadFound = true;

                foreach (var bn in n.Descendants())
                {
                    // Leaf nodes only, so each crumb's text is collected exactly once.
                    if (bn.ChildNodes.Count != 0)
                    {
                        continue;
                    }
                    var crumb = TrimBreadcrumbSeparators(txt(bn));
                    if (!string.IsNullOrWhiteSpace(crumb))
                    {
                        categories.Add(crumb);
                    }
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.Price) && AttributeCurrentContains("price"))
            {
                var txtPrice = txt(n);
                if (AttributableItem.DoubleGetFromString(txtPrice, out double price) && price > 0)
                {
                    // NOTE(review): ToString() formats with the current culture — confirm
                    // downstream consumers expect that rather than an invariant format.
                    _p.Product.Price = price.ToString();
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.FullDescription) && AttributeCurrentContains("description"))
            {
                _p.Product.FullDescription = txt(n);
            }

            if (string.IsNullOrWhiteSpace(_p.Product.Art) && AttributeCurrentContains("sku"))
            {
                var sku = IdentifierText(n, 4);
                if (sku != null)
                {
                    _p.Product.Art = sku;
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.Code) && AttributeCurrentContains("code"))
            {
                var code = IdentifierText(n, 4);
                if (code != null)
                {
                    _p.Product.Code = code;
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.MNP) && AttributeCurrentContains("MPN"))
            {
                var mpn = IdentifierText(n, -1);
                if (mpn != null)
                {
                    _p.Product.MNP = mpn;
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.BarCode)
                && (AttributeCurrentContains("barcode") || AttributeCurrentContains("upc")))
            {
                var barcode = IdentifierText(n, -1);
                if (barcode != null)
                {
                    _p.Product.BarCode = barcode;
                }
            }

            if (string.IsNullOrWhiteSpace(_p.Product.IdShopDb) && AttributeCurrentContains("asin"))
            {
                var asin = IdentifierText(n, -1);
                if (asin != null)
                {
                    _p.Product.IdShopDb = asin;
                }
            }

            // Extraction of a product id ("productid" / "product-id") was disabled in the
            // original sample and remains disabled here.

            if (string.IsNullOrWhiteSpace(_p.Product.Quantity)
                && (AttributeCurrentContains("avail") || AttributeCurrentContains("quantity") || AttributeCurrentContains("qty")))
            {
                _p.Product.Quantity = txt(n);
            }

            if (string.IsNullOrWhiteSpace(_p.Product.Currency) && AttributeCurrentContains("currency"))
            {
                _p.Product.Currency = txt(n);
            }

            if (string.IsNullOrWhiteSpace(_p.Product.Manufacturer) && AttributeCurrentContains("manufacturer"))
            {
                _p.Product.Manufacturer = txt(n);
            }
        }
    }

    if (categories.Count > 0)
    {
        _p.Product.CategoryFullPathSaveTo = String.Join("->", categories);
    }
}

/// <summary>
/// Strips whitespace and the breadcrumb separator characters
/// <c>&gt; - | / \</c> from both ends of <paramref name="text"/>, repeating until
/// nothing more can be removed. A null input is treated as an empty string.
/// </summary>
private static string TrimBreadcrumbSeparators(string text)
{
    var current = (text ?? "").Trim();
    while (true)
    {
        // Separators and whitespace can alternate (e.g. " > - "), hence the loop.
        var next = current.Trim('>', '-', '|', '/', '\\').Trim();
        if (next == current)
        {
            return current;
        }
        current = next;
    }
}

/// <summary>
/// Returns the plain text of <paramref name="n"/> when it looks like a short identifier:
/// longer than <paramref name="minLengthExclusive"/> and shorter than 16 characters.
/// Returns null otherwise.
/// </summary>
private string IdentifierText(HtmlAgilityPack.HtmlNode n, int minLengthExclusive)
{
    const int maxLengthExclusive = 16;
    var text = txt(n);
    if (text != null && text.Length > minLengthExclusive && text.Length < maxLengthExclusive)
    {
        return text;
    }
    return null;
}

/// <summary>
/// Adds every <c>&lt;img itemprop="image"&gt;</c> source to the current product.
/// Sources that do not start with "http" are resolved against the product URL;
/// sources that cannot be resolved to a valid URI are skipped.
/// </summary>
private void Images()
{
    // Straight quotes are required: typographic quotes are not XPath string delimiters.
    var imgs = _htmlDoc.DocumentNode.SelectNodes("//img[@itemprop='image']");
    if (imgs == null)
    {
        return;
    }

    // Parsed once for all images; stays null when the product URL is not a valid absolute URI.
    Uri pageUri;
    if (!Uri.TryCreate(_p.Product.Url, UriKind.Absolute, out pageUri))
    {
        pageUri = null;
    }

    foreach (var img in imgs)
    {
        var srcAttr = img.Attributes["src"];
        if (srcAttr == null || string.IsNullOrWhiteSpace(srcAttr.Value))
        {
            continue;
        }

        var src = srcAttr.Value;
        if (!src.StartsWith("http", StringComparison.InvariantCultureIgnoreCase))
        {
            // TryCreate instead of the Uri constructor: one malformed src must not
            // abort the whole product with a UriFormatException.
            Uri resolved;
            if (pageUri == null || !Uri.TryCreate(pageUri, src, out resolved))
            {
                continue;
            }
            src = resolved.ToString();
        }

        _p.Product.ImageAdd(src);
    }
}

/// <summary>
/// Treats every two-cell table row as a "name / value" feature pair and stores it as a
/// dynamic attribute, provided both texts are non-blank and shorter than 50 characters.
/// </summary>
private void FeaturesTable()
{
    var tables = _htmlDoc.DocumentNode.SelectNodes("//table");
    if (tables == null)
    {
        return;
    }

    foreach (var table in tables)
    {
        var rows = table.SelectNodes(".//tr");
        if (rows == null)
        {
            continue;
        }

        foreach (var row in rows)
        {
            // Only rows with exactly two cells (td or th) look like a key/value pair.
            var cells = row.SelectNodes(".//td|.//th");
            if (cells == null || cells.Count != 2)
            {
                continue;
            }

            var label = TextUtils.GetPlainTextOnly(cells[0].InnerHtml);
            var value = TextUtils.GetPlainTextOnly(cells[1].InnerHtml);

            if (string.IsNullOrWhiteSpace(label) || string.IsNullOrWhiteSpace(value))
            {
                continue;
            }
            // Long texts are skipped rather than stored as features.
            if (label.Length >= 50 || value.Length >= 50)
            {
                continue;
            }

            _p.Product.AddDynamicAttribute(label, value);
        }
    }
}

/// <summary>
/// Looks for list items whose child nodes yield at least two non-blank texts and
/// stores the first two as a "name / value" dynamic attribute, provided both are
/// shorter than 50 characters.
/// </summary>
private void FeaturesUl()
{
    var lists = _htmlDoc.DocumentNode.SelectNodes("//ul");
    if (lists == null)
    {
        return;
    }

    foreach (var list in lists)
    {
        var items = list.SelectNodes(".//li");
        if (items == null)
        {
            continue;
        }

        foreach (var item in items)
        {
            if (item.ChildNodes.Count < 2)
            {
                continue;
            }

            // Collect the non-blank plain text of each direct child, in document order.
            var parts = new List<string>();
            foreach (var child in item.ChildNodes)
            {
                var text = TextUtils.GetPlainTextOnly(child.InnerHtml);
                if (!string.IsNullOrWhiteSpace(text))
                {
                    parts.Add(text);
                }
            }
            if (parts.Count < 2)
            {
                continue;
            }

            var label = parts[0];
            var value = parts[1];

            var bothPresent = !string.IsNullOrWhiteSpace(label) && !string.IsNullOrWhiteSpace(value);
            if (bothPresent && label.Length < 50 && value.Length < 50)
            {
                _p.Product.AddDynamicAttribute(label, value);
            }
        }
    }
}

/// <summary>
/// Reports whether the attribute currently being examined mentions
/// <paramref name="substring"/> in either its name or its value, ignoring case.
/// </summary>
bool AttributeCurrentContains(string substring)
{
    const StringComparison mode = StringComparison.InvariantCultureIgnoreCase;

    var foundInName = _attributeName_Current.IndexOf(substring, mode) >= 0;
    // The value is only searched when the name did not match.
    return foundInName || _attributeValue_Current.IndexOf(substring, mode) >= 0;
}

/// <summary>
/// Returns the plain text of <paramref name="n"/>, computing it from the node's inner
/// HTML on first request and serving it from the per-node cache afterwards.
/// </summary>
string txt(HtmlAgilityPack.HtmlNode n)
{
    if (!_object2txt.TryGetValue(n, out string cached))
    {
        cached = TextUtils.GetPlainTextOnly(n.InnerHtml);
        _object2txt[n] = cached;
    }
    return cached;
}

// Placeholder: currently does nothing and ignores htmlText.
// The commented-out sketch below locates the text following "redirectResponse=",
// matches a brace-balanced {...} span with a balancing-group regex, and parses
// that span with JObject.Parse.
void JsonFind(string htmlText)
{
//string htmlText = Resources.html;
//string jsonPtn = @"\{(?:[^\{\}]|(?<o>\{)|(?<-o>\}))+(?(o)(?!))\}";
//string input = htmlText.Substring(htmlText.IndexOf("redirectResponse="));
//Match match = Regex.Matches(input, jsonPtn, RegexOptions.Multiline | RegexOptions.IgnoreCase)[0];
//string jsonText = match.Groups[0].Value;
//var jsonObj = JObject.Parse(jsonText);
}
}
}

[/code]