Version complète: sur le forum Webmaster Hub : Explication d'un script
Webmaster Hub > Création et exploitation de Sites Internet > Les langages du Net > PHP
Calli
(nouveau post car nouveau code, pour ne pas mélanger)

Salut, je viens de trouver un script qui permet de parser du XHTML,

mais les explications sont en anglais.
Serait-il possible de commenter et d'expliquer le script ?

Que faut-il paramétrer ou modifier pour parser la page www.machin.com ?


CODE
<?php

/**
* HTML/XML Parser Class
*
* This is a helper class that is used to parse HTML and XML. A unique feature of this parsing class
* is the fact that it includes support for innerHTML (which isn't easy to do).
*
* @author Dennis Pallett
* @copyright Dennis Pallett 2006
* @package HTML_Parser
* @version 1.0
*/

// Helper Class
// To parse HTML/XML
/**
 * Event-driven (X)HTML/XML parser built on PHP's ext/xml (xml_parse).
 *
 * parse() returns a nested array describing the document: every element is an
 * array with the keys "name", "attr" and "level", plus, when applicable,
 * "tagData" (non-blank text seen while the element was the innermost open one),
 * "innerhtml" (re-serialized content of the element) and "children".
 * Element and attribute names are upper-cased in the tree because case folding
 * is enabled; the re-serialized markup uses lower-case names.
 *
 * The input must be well-formed XML: on a parse error parse() returns false and
 * the xml_error* properties describe the problem.
 */
class HTML_Parser {
    // Properties (all public, as the original `var` declarations were).
    public $_parser;                 // handle returned by xml_parser_create()
    public $_tags = array();         // last opened tag per nesting level
    public $_html;                   // re-serialized markup of the parsed document
    public $output = array();        // stack of open elements; holds the root once parsing is done
    public $strXmlData;              // return value of xml_parse()
    public $_level = 0;              // current nesting depth (root element = 1)
    public $_outline;                // mirrors $_html after parse(); kept because the property existed
    public $_tagcount = array();     // not used by this class; kept because the property existed
    public $xml_error = false;
    public $xml_error_code;
    public $xml_error_string;
    public $xml_error_line_number;
    public $_innerStart = array();   // byte offset in $_html where each open element's content starts, per level

    /**
     * Returns the markup re-serialized during the last parse() call.
     */
    public function get_html () {
        return $this->_html;
    }

    /**
     * Parses a complete document and returns its tree.
     *
     * @param string $strInputXML Well-formed (X)HTML/XML source.
     * @return array|false Array whose element 0 is the root element, or false
     *                     on a parse error (see the xml_error* properties).
     */
    public function parse($strInputXML) {
        // Start from a clean state so the instance can be reused: without this
        // a second call would append to the previous $_html and keep stale
        // error flags and nesting level.
        $this->output = array();
        $this->_tags = array();
        $this->_innerStart = array();
        $this->_html = '';
        $this->_outline = '';
        $this->_level = 0;
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Named HTML entities (&eacute; ...) are unknown to the XML parser;
        // turn them into numeric character references first.
        $strInputXML = $this->translate_entities((string) $strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);

        // Array callables instead of xml_set_object() + method-name strings.
        xml_set_element_handler($this->_parser, array($this, 'tagOpen'), array($this, 'tagClosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        // The whole document is passed in one call, so flag it as the final
        // chunk; otherwise errors only detectable at end of input (e.g. an
        // unclosed root element) are never reported.
        $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

        $this->_outline = $this->_html;

        if (!$this->strXmlData) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }

        return $this->output;
    }

    /**
     * Start-element handler: pushes the element on the stack and appends its
     * opening tag to the re-serialized markup.
     */
    public function tagOpen($parser, $name, $attr) {
        $this->_level++;

        $newtag = $this->create_tag($name, $attr);
        $tag = array("name"=>$name, "attr"=>$attr, "level"=>$this->_level);

        // Push on the stack of open elements and remember it for this level.
        array_push($this->output, $tag);
        $this->_tags[$this->_level] = $tag;

        $this->_html .= $newtag;

        // Everything appended to $_html from here until the matching close
        // handler is this element's innerHTML.
        $this->_innerStart[$this->_level] = strlen($this->_html);
    }

    /**
     * Builds the lower-cased opening tag for an element.
     *
     * @param string $name Element name.
     * @param array  $attr Attribute name => value pairs.
     * @return string e.g. '<a href="x">' or '<br />'.
     */
    public function create_tag ($name, $attr) {
        $lname = strtolower($name);
        $tag = '<' . $lname . ' ';

        foreach ($attr as $key=>$val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }

        // Drops the trailing space left by the name or the last attribute.
        $tag = trim($tag);

        if ($this->_is_void_tag($lname)) {
            $tag .= ' /';
        }

        return $tag . '>';
    }

    /**
     * Character-data handler: stores non-blank text on the innermost open
     * element and appends the escaped text to the re-serialized markup.
     */
    public function tagData($parser, $tagData) {
        $last = count($this->output) - 1;

        // Compare against '' explicitly: a bare truthiness test would discard
        // the text "0".
        if ($last >= 0 && trim($tagData) !== '') {
            if (isset($this->output[$last]['tagData'])) {
                $this->output[$last]['tagData'] .= $tagData;
            } else {
                $this->output[$last]['tagData'] = $tagData;
            }
        }

        $this->_html .= htmlentities($tagData);
    }

    /**
     * End-element handler: records innerHTML, appends the closing tag and
     * moves the finished element into its parent's "children" list.
     */
    public function tagClosed($parser, $name) {
        $lname = strtolower($name);
        $last = count($this->output) - 1;

        if (!$this->_is_void_tag($lname)) {
            // innerHTML is the slice of $_html produced since the opening tag.
            // Using offsets (rather than searching for level-prefixed tags)
            // cannot be confused by text that happens to end in a digit.
            $start = isset($this->_innerStart[$this->_level])
                ? $this->_innerStart[$this->_level]
                : strlen($this->_html);
            // Cast: substr() returns false instead of '' on some older PHP versions.
            $this->output[$last]['innerhtml'] = (string) substr($this->_html, $start);

            $this->_html .= '</' . $lname . '>';
        }
        unset($this->_innerStart[$this->_level]);

        // Attach to the parent; the root element has none and stays in
        // $this->output as the final result.
        if ($last >= 1) {
            $this->output[$last - 1]['children'][] = $this->output[$last];
            array_pop($this->output);
        }

        $this->_level--;
    }

    /**
     * Converts named HTML entities to numeric character references
     * (e.g. &eacute; -> &#233;), or back when $reverse is true.
     * The XML-significant entities for & " < > are left untouched.
     *
     * @param string $xmlSource
     * @param bool   $reverse   true to map numeric references back to names.
     * @return string
     */
    public function translate_entities($xmlSource, $reverse =FALSE) {
        static $literal2NumericEntity;

        if (empty($literal2NumericEntity)) {
            $literal2NumericEntity = array();
            // Request the table explicitly in UTF-8 so the keys have a known encoding.
            $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');

            foreach ($transTbl as $char => $entity) {
                $char = (string) $char;
                if (strpos('&"<>', $char) !== FALSE) continue;
                // ord() would only see the first byte of a multi-byte
                // character; decode the full code point instead.
                $literal2NumericEntity[$entity] = '&#' . $this->_utf8_codepoint($char) . ';';
            }
        }

        if ($reverse) {
            return strtr((string) $xmlSource, array_flip($literal2NumericEntity));
        }

        return strtr((string) $xmlSource, $literal2NumericEntity);
    }

    /**
     * True for the elements this class serializes as self-closing tags.
     * Deliberately limited to the original set (br, input).
     */
    private function _is_void_tag($lowerName) {
        return $lowerName === 'br' || $lowerName === 'input';
    }

    /**
     * Returns the Unicode code point of the first character of a well-formed
     * UTF-8 string (1 to 4 bytes).
     */
    private function _utf8_codepoint($char) {
        $b0 = ord($char[0]);

        if ($b0 < 0x80) {
            return $b0;
        }
        if (($b0 & 0xE0) === 0xC0) {
            return (($b0 & 0x1F) << 6) | (ord($char[1]) & 0x3F);
        }
        if (($b0 & 0xF0) === 0xE0) {
            return (($b0 & 0x0F) << 12) | ((ord($char[1]) & 0x3F) << 6) | (ord($char[2]) & 0x3F);
        }

        return (($b0 & 0x07) << 18) | ((ord($char[1]) & 0x3F) << 12)
            | ((ord($char[2]) & 0x3F) << 6) | (ord($char[3]) & 0x3F);
    }
}

// To be used like this.
// $html must already hold the markup to parse; it is not defined in this file.
$parser = new HTML_Parser;
$output = $parser->parse($html);

if ($output === false) {
    // parse() returns false on malformed input, and print_r(false) prints
    // nothing, so report the error recorded by the parser instead.
    echo 'XML error ' . $parser->xml_error_code
        . ' (' . $parser->xml_error_string . ')'
        . ' on line ' . $parser->xml_error_line_number;
} else {
    print_r ($output);
}

?>
Anonymus
La réponse se situe en fin de page :
CITATION
// To be used like this
$parser = new HTML_Parser;
$output = $parser->parse($html);

print_r ($output);


Utiliser comme ceci :
CODE
// $html is not defined in this snippet: it must already contain the markup to parse.
$parser = new HTML_Parser;
// parse() returns the parsed tree as an array, or false if the XML parser reports an error.
$output = $parser->parse($html);

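En passant le contenu de la page dans la variable $html (par exemple : $html = file_get_contents('http://www.machin.com/');), la variable $output contiendra un tableau décrivant ta page. Attention : la page doit être du XHTML bien formé, sinon parse() renvoie false.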
Ceci est une version "bas débit" de notre forum. Pour voir la version complète avec plus d'informations, la mise en page et les images, veuillez cliquer ici.