# -*- coding: utf-8 -*-
import grab
import rdflib
from rdflib import *
from rdflib import plugin

# Make the rdfextras SPARQL implementation available to rdflib under the
# plugin name 'sparql': one registration for the query processor and one
# for the query result class.
plugin.register(
    'sparql',
    rdflib.query.Processor,
    'rdfextras.sparql.processor',
    'Processor',
)
plugin.register(
    'sparql',
    rdflib.query.Result,
    'rdfextras.sparql.query',
    'SPARQLQueryResult',
)
class SQtHD(object):
    '''
    SPARQL query to HTML documents.

    Fetches an HTML page with Grab, copies its element tree into an
    in-memory rdflib graph (one blank node per element) and runs SPARQL
    queries against that graph.
    '''

    def __init__(self, url=None, htmlNamespace='http://localhost/rdf/html#'):
        '''
        Create the page fetcher, the empty graph and the ``html`` namespace.

        url -- optional address of a page to load right away.
        htmlNamespace -- base URI used for every element type and predicate
            generated by the parser; it is bound to the ``html`` prefix.
        '''
        self.__grab__ = grab.Grab()
        self.__storage__ = Graph()
        self.__namespace__ = Namespace(htmlNamespace)
        self.__storage__.bind('html', URIRef(htmlNamespace))
        # Snapshot of every prefix bound on the graph at this point; it is
        # passed as initNs to each query so callers can write ``html:...``
        # without PREFIX declarations.
        self.__initnamespace__ = dict(
            self.__storage__.namespace_manager.namespaces())
        if url:
            self.__store__(url)

    def __store__(self, url):
        '''Replace the graph contents with the parsed page found at *url*.'''
        # An all-wildcard pattern removes every triple currently stored.
        self.__storage__.remove((None, None, None))
        self.__grab__.go(url)
        root = self.__grab__.tree.getroottree().getroot()
        self.__parse__(root)

    def __parse__(self, element, parent=None, children_position=None,
                  children_level=0):
        '''
        Recursively add *element* and its descendants to the graph.

        element -- tree node to convert (as exposed by ``Grab.tree``).
        parent -- blank node of the parent element, or None for the root.
        children_position -- index of *element* among its siblings, or None
            when it is not known (the root).
        children_level -- depth of *element* below the root (root is 0).

        Triples written for each element (all in the html namespace):
        ``rdf:type`` (tag name, or ``comment``), ``parent`` / ``children``
        links, ``children_position``, ``children_level``, ``text``,
        ``text_content``, ``children_count`` and one triple per attribute.
        '''
        ns = self.__namespace__
        add = self.__storage__.add
        node = BNode()
        # list(element) yields the direct children; getchildren() is
        # deprecated in lxml.
        children = list(element)

        tag = element.tag
        # Comment nodes carry the Comment factory function as their tag
        # instead of a string.  Match it by function name: the str() of the
        # function is an implementation detail and is not guaranteed to be
        # '<built-in function Comment>' everywhere.
        if getattr(tag, '__name__', None) == 'Comment':
            add((node, RDF.type, ns['comment']))
        else:
            add((node, RDF.type, ns[tag]))

        if parent is not None:
            add((node, ns['parent'], parent))
            add((parent, ns['children'], node))
        # The root has no position among siblings; skip the triple rather
        # than storing a literal built from None.
        if children_position is not None:
            add((node, ns['children_position'], Literal(children_position)))
        add((node, ns['children_level'], Literal(children_level)))

        own_text = element.text
        if own_text and own_text.strip():
            add((node, ns['text'], Literal(own_text.strip())))

        # Computed once; the original evaluated text_content() twice.
        full_text = element.text_content()
        if full_text and full_text.strip():
            add((node, ns['text_content'], Literal(full_text.strip())))

        add((node, ns['children_count'], Literal(len(children))))

        # NOTE: attributes share the namespace with the structural
        # predicates above, so an attribute literally named e.g. ``text``
        # is stored under the same predicate as the element text.
        for name, value in element.attrib.items():
            add((node, ns[name], Literal(value)))

        for position, child in enumerate(children):
            self.__parse__(child, node, position, children_level + 1)

    def executeQuery(self, query, url=None):
        '''
        Execute a SPARQL query on the storage.

        query -- SPARQL query text; prefixes bound at construction time
            (including ``html``) are available without declaration.
        url -- optional page to load (replacing the current graph) before
            the query runs.

        Returns whatever ``Graph.query`` returns for the query.
        '''
        if url:
            self.__store__(url)
        return self.__storage__.query(query, initNs=self.__initnamespace__)

    def loadStoradge(self, url):
        '''Load and parse the HTML page at *url* into the local RDF storage.'''
        self.__store__(url)
def _print_query(end_point, title, query):
    '''Print *title*, each row returned by *query*, then a blank line.'''
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # print("") stands in for the bare Python 2 ``print`` (blank line).
    print(title)
    answer = end_point.executeQuery(query)
    # ``.result`` is the row container used by the original script.
    # NOTE(review): presumably provided by the rdfextras result class
    # registered at import time - confirm against the installed version.
    for row in answer.result:
        print(row)
    print("")


if __name__ == "__main__":
    # Demo: load one page and run a few sample queries against it.
    endPoint = SQtHD()
    endPoint.loadStoradge('http://habrahabr.ru')

    _print_query(
        endPoint,
        "All sources for images given by tag <img>:",
        'SELECT DISTINCT ?src { ?a rdf:type html:img. ?a html:src ?src. }')

    _print_query(
        endPoint,
        "All link urls:",
        'SELECT DISTINCT ?href { ?a rdf:type html:a. ?a html:href ?href. }')

    _print_query(
        endPoint,
        "All class names for elements:",
        'SELECT DISTINCT ?class { ?a html:class ?class. }')

    # Disabled in the original (it was wrapped in a string literal):
    # text of inline scripts, i.e. scripts not loaded through src.
    # _print_query(
    #     endPoint,
    #     "All scripts (without loaded by src):",
    #     'SELECT ?text { ?a rdf:type html:script. ?a html:text ?text. }')

    _print_query(
        endPoint,
        "All script srcs:",
        'SELECT ?src { ?a rdf:type html:script. ?a html:src ?src. }')
# Example output of the last query:
#
# All script srcs:
# /javascripts/1341931979/all.js
# /javascripts/1341931979/_parts/posts.js
# /javascripts/1341931979/_parts/to_top.js
# /javascripts/1341931979/_parts/shortcuts.js
# /javascripts/1341931979/libs/jquery.form.js
# /javascripts/1341931979/facebook_reader.js
# /js/1341931979/adriver.core.2.js
# /javascripts/1341931979/libs/highlight.js
# /javascripts/1341931979/hubs/all.js
# /javascripts/1341931979/posts/all.js
Source: https://habr.com/ru/post/147579/
All Articles