/// <summary>
/// Base class for data acquisition services that scrape through a WatiN
/// browser instance supplied by the caller.
/// </summary>
public abstract class WatinDataAcquisitionService : DataAcquisitionService
{
  /// <summary>
  /// This method must be implemented by any scraping service that needs to
  /// use WatiN.
  /// </summary>
  /// <param name="browser">A preinitialized <c>Browser</c> object
  /// that one can use for scraping.</param>
  /// <param name="log">Logger supplied by the caller; messages written
  /// to it are recorded by the caller's logging setup.</param>
  /// <remarks>Do not pass the <c>browser</c> object into other
  /// threads or asynchronous operations.</remarks>
  public abstract void AcquireData(Browser browser, ILog log);
}
Browser type (it can be IE or FireFox), as well as a reference to the logger from the main service - this allows us to log the process from the main host.
WatinDataAcquisitionService
:
/// <summary>
/// The WatiN scraping services available to the host. Marked with
/// <c>ImportMany</c> so that MEF assigns every matching
/// <c>WatinDataAcquisitionService</c> export during composition.
/// </summary>
[ImportMany(typeof(WatinDataAcquisitionService))]
public WatinDataAcquisitionService[] WatinServices { get; set; }
plugins
subdirectory:
// Build a catalog from the assemblies in the "plugins" directory, wrap it in
// a container, and let the container satisfy the imports declared on this object.
cat = new DirectoryCatalog("plugins");
cc = new CompositionContainer(cat);
cc.ComposeParts(this);
DoWork()
method looks quite chic. Let's show it first:
/// <summary>
/// Polling loop of the host. On every pass it logs how many WatiN services
/// were imported, runs each of them against one shared, hidden IE instance,
/// and then sleeps for <c>pollingFrequency</c>. The loop never exits.
/// </summary>
private void DoWork()
{
  while (true)
  {
    log.InfoFormat("Found {0} WatiN services", WatinServices.Length);
    if (WatinServices.Length > 0)
    {
      // One browser per pass, shared by all services and disposed when the pass ends.
      using (var browser = new IE())
      {
        browser.Visible = false;
        foreach (var service in WatinServices)
        {
          var serviceName = service.GetType().FullName;
          // NOTE(review): MyTimer presumably reports the elapsed time for
          // this service when it is disposed - confirm against MyTimer.
          using (var timer = new MyTimer(serviceName, log))
          {
            // prevent errors from bleeding through: one failing service
            // must not stop the remaining services or the polling loop
            try
            {
              service.AcquireData(browser, log);
            }
            catch (Exception ex)
            {
              log.Error(
                string.Format("WatiN service {0} threw an exception", serviceName),
                ex);
            }
          }
        }
      }
    }
    // do some work, then
    Thread.Sleep(pollingFrequency);
  }
}
Export
attribute. Like this:
/// <summary>
/// Sample plugin: opens pokemon.com in the supplied browser and logs the
/// text of the first <c>h3</c> element on the page.
/// </summary>
[Export(typeof(WatinDataAcquisitionService))]
public class PokemonService : WatinDataAcquisitionService
{
  /// <summary>
  /// Navigates to the site, parses the rendered page body and logs the first
  /// <c>h3</c> heading, or a warning when the page has none.
  /// </summary>
  /// <param name="browser">Browser instance supplied by the host.</param>
  /// <param name="log">Logger supplied by the host.</param>
  public override void AcquireData(Browser browser, ILog log)
  {
    log.Info("Pokemon service running");
    browser.GoTo("http://www.pokemon.com");
    var doc = new HtmlDocument();
    doc.LoadHtml(browser.Body.OuterHtml);
    // HtmlAgilityPack returns null (not an empty collection) when an XPath
    // query matches nothing, so SelectNodes(...).First() would throw on a
    // page without headings. SelectSingleNode yields the first match or null.
    var h3 = doc.DocumentNode.SelectSingleNode("//h3");
    if (h3 == null)
    {
      log.Warn("Pokemon service found no h3 element on the page");
      return;
    }
    log.Info(h3.InnerText);
  }
}
plugins
folder and everything will work. Danger, Will Robinson: the dependencies, too, need to be copied to this folder or merged in with ILMerge (the latter is preferable).
2*Environment.ProcessorCount
copies and everything is more or less working.
StaTaskScheduler
that will create STA threads instead of MTA. Fortunately, this solution was already on the network ( on MSDN ), and I put it in the examples. Here is an example of how you can run 4 copies of IE each time:
// NOTE(review): assumes StaTaskScheduler is the ParallelExtensionsExtras
// implementation, which is IDisposable and owns its STA threads - confirm.
// Disposing it after the loop releases those threads instead of leaking them.
using (var scheduler = new StaTaskScheduler(4))
{
  var po = new ParallelOptions();
  po.TaskScheduler = scheduler;
  Parallel.For(0, 100, po, x =>
  {
    using (var browser = new IE("http://news.bbc.co.uk"))
    {
      browser.Visible = false;
      var doc = new HtmlDocument();
      doc.LoadHtml(browser.Body.OuterHtml);
      // SelectSingleNode returns the first match or null; SelectNodes returns
      // null when nothing matches, which would make .First() throw and abort
      // the whole Parallel.For.
      var h3 = doc.DocumentNode.SelectSingleNode("//h3");
      if (h3 != null)
      {
        Console.WriteLine(h3.InnerText);
      }
    }
  });
}
Source: https://habr.com/ru/post/94960/
All Articles