Follow along with the video below to see how to install our site as a web app on your home screen.
Примечание: This feature may not be available in some browsers.
function get_description($file){
$html = file_get_contents("http://".$file);
preg_match('|<description>(.*)</description>|mi', $html, $result);
preg_match('/encoding="([^"]+)"/iUs', $html, $charset);
$result = iconv($charset[1], "utf8", $result[1]);
return $result;
}
function get_charset($file){
$html = file_get_contents("http://".$file);
preg_match('/encoding="([^"]+)"/iUs', $html, $charset);
return $charset[1];
}
function returnSubstrings($text, $openingMarker, $closingMarker) {
$openingMarkerLength = mb_strlen($openingMarker);
$closingMarkerLength = mb_strlen($closingMarker);
$result = array();
$position = 0;
while (($position = mb_strpos($text, $openingMarker, $position)) !== false) {
$position += $openingMarkerLength;
if (($closingMarkerPosition = mb_strpos($text, $closingMarker, $position)) !== false) {
$result[] = substr($text, $position, $closingMarkerPosition - $position);
$position = $closingMarkerPosition + $closingMarkerLength;
}
}
return $result;
}
/**
* Универсальный парсер. Передаем строку - темплейт типа бла-бла-бла{$название_параметра}бла-бла-бла.
* И контент. Получаем готовую ленту.
* @param string $str
* @param string $template
* @param mixed $result
*
* В темплейте мы вырезаем все нужные данные и ставим вместо них признаки - {$переменная}.
* Всякую шваль метим как {$all}.
* Но выходе получаем массив новостей типа - array of array ('переменная' => полученное значение)
*/
function html_reg_parse($str, $template, &$result) {
// Темплейт парсим на наличие переменных вида {$}
preg_match_all('|{\$(.*)}|Us', $template, $out);
$opers = $out[1];
$reg = "|".str_replace($out[0],'(.*)',$template)."|Us";
preg_match_all($reg, $str, $out);
for ($i=1; $i<=count($opers); $i++) {
for ($j=0; $j<count($out[$i]); $j++)
if (isset($result[$j][$opers[$i-1]]))
$result[$j][$opers[$i-1]] .= $out[$i][$j];
else
$result[$j][$opers[$i-1]] = $out[$i][$j];
}
}
Пример использования:
$content = ""; //XML!!!
$template = "<operation id=\"{$n1}\" ts=\"{$n2}\">";
html_reg_parse($content, $template, $res);
print_r($res);
подскажите, как сделать сохранение в файл сграбленного этой библиотекой?я остановился на проекте
*** скрытое содержание ***
вот к примеру кусок рабочего парсера
Первая строка выдерет только обычнный текст из тегов div class=votersPHP:$ret['Rating'] = $html->find('div[class="voters"] span', 0)->plaintext; $ret['ImageLink'] = $html->find('div[class="maindata"] div img', 0)->outertext; $ret['Ocenka'] = $html->find('table[class="form pros"]', 0)->innertext;
Втроя заберет ссылку на картинку из вложенного div в divе с классом maindata.
а третья выдерет все что внутри таблицы с классом form pros (html)
и т.д.
Очень подвижный парсер, примеров там штук 8 есть. Не халява - мануал почитать придется.но он того стоит.
часа 4 читал доку 5 часов писал...
но теперь и парсер и граббер и 20к страниц + графика у мя в кармане.
<?php
define ("NODE_TYPE_START",0);
define ("NODE_TYPE_ELEMENT",1);
define ("NODE_TYPE_ENDELEMENT",2);
define ("NODE_TYPE_TEXT",3);
define ("NODE_TYPE_COMMENT",4);
define ("NODE_TYPE_DONE",5);
class HtmlParser {
var $iNodeType;
var $iNodeName = "";
var $iNodeValue = "";
var $iNodeAttributes;
var $iHtmlText;
var $iHtmlTextLength;
var $iHtmlTextIndex = 0;
var $iHtmlCurrentChar;
var $BOE_ARRAY;
var $B_ARRAY;
var $BOS_ARRAY;
function HtmlParser ($aHtmlText) {
$this->iHtmlText = $aHtmlText;
$this->iHtmlTextLength = strlen($aHtmlText);
$this->iNodeAttributes = array();
$this->setTextIndex (0);
$this->BOE_ARRAY = array (" ", "\t", "\r", "\n", "=" );
$this->B_ARRAY = array (" ", "\t", "\r", "\n" );
$this->BOS_ARRAY = array (" ", "\t", "\r", "\n", "/" );
}
function parse() {
$text = $this->skipToElement();
if ($text != "") {
$this->iNodeType = NODE_TYPE_TEXT;
$this->iNodeName = "Text";
$this->iNodeValue = $text;
return true;
}
return $this->readTag();
}
function clearAttributes() {
$this->iNodeAttributes = array();
}
function readTag() {
if ($this->iCurrentChar != "<") {
$this->iNodeType = NODE_TYPE_DONE;
return false;
}
$this->clearAttributes();
$this->skipMaxInTag ("<", 1);
if ($this->iCurrentChar == '/') {
$this->moveNext();
$name = $this->skipToBlanksInTag();
$this->iNodeType = NODE_TYPE_ENDELEMENT;
$this->iNodeName = $name;
$this->iNodeValue = "";
$this->skipEndOfTag();
return true;
}
$name = $this->skipToBlanksOrSlashInTag();
if (!$this->isValidTagIdentifier ($name)) {
$comment = false;
if (strpos($name, "!--") === 0) {
$ppos = strpos($name, "--", 3);
if (strpos($name, "--", 3) === (strlen($name) - 2)) {
$this->iNodeType = NODE_TYPE_COMMENT;
$this->iNodeName = "Comment";
$this->iNodeValue = "<" . $name . ">";
$comment = true;
}
else {
$rest = $this->skipToStringInTag ("-->");
if ($rest != "") {
$this->iNodeType = NODE_TYPE_COMMENT;
$this->iNodeName = "Comment";
$this->iNodeValue = "<" . $name . $rest;
$comment = true;
// Already skipped end of tag
return true;
}
}
}
if (!$comment) {
$this->iNodeType = NODE_TYPE_TEXT;
$this->iNodeName = "Text";
$this->iNodeValue = "<" . $name;
return true;
}
}
else {
$this->iNodeType = NODE_TYPE_ELEMENT;
$this->iNodeValue = "";
$this->iNodeName = $name;
while ($this->skipBlanksInTag()) {
$attrName = $this->skipToBlanksOrEqualsInTag();
if ($attrName != "" && $attrName != "/") {
$this->skipBlanksInTag();
if ($this->iCurrentChar == "=") {
$this->skipEqualsInTag();
$this->skipBlanksInTag();
$value = $this->readValueInTag();
$this->iNodeAttributes[strtolower($attrName)] = $value;
}
else {
$this->iNodeAttributes[strtolower($attrName)] = "";
}
}
}
}
$this->skipEndOfTag();
return true;
}
function isValidTagIdentifier ($name) {
return ereg ("^[A-Za-z0-9_\\-]+$", $name);
}
function skipBlanksInTag() {
return "" != ($this->skipInTag ($this->B_ARRAY));
}
function skipToBlanksOrEqualsInTag() {
return $this->skipToInTag ($this->BOE_ARRAY);
}
function skipToBlanksInTag() {
return $this->skipToInTag ($this->B_ARRAY);
}
function skipToBlanksOrSlashInTag() {
return $this->skipToInTag ($this->BOS_ARRAY);
}
function skipEqualsInTag() {
return $this->skipMaxInTag ("=", 1);
}
function readValueInTag() {
$ch = $this->iCurrentChar;
$value = "";
if ($ch == "\"") {
$this->skipMaxInTag ("\"", 1);
$value = $this->skipToInTag ("\"");
$this->skipMaxInTag ("\"", 1);
}
else if ($ch == "'") {
$this->skipMaxInTag ("'", 1);
$value = $this->skipToInTag ("'");
$this->skipMaxInTag ("'", 1);
}
else {
$value = $this->skipToBlanksInTag();
}
return $value;
}
function setTextIndex ($index) {
$this->iHtmlTextIndex = $index;
if ($index >= $this->iHtmlTextLength) {
$this->iCurrentChar = -1;
}
else {
$this->iCurrentChar = $this->iHtmlText{$index};
}
}
function moveNext() {
if ($this->iHtmlTextIndex < $this->iHtmlTextLength) {
$this->setTextIndex ($this->iHtmlTextIndex + 1);
return true;
}
else {
return false;
}
}
function skipEndOfTag() {
while (($ch = $this->iCurrentChar) !== -1) {
if ($ch == ">") {
$this->moveNext();
return;
}
$this->moveNext();
}
}
function skipInTag ($chars) {
$sb = "";
while (($ch = $this->iCurrentChar) !== -1) {
if ($ch == ">") {
return $sb;
} else {
$match = false;
for ($idx = 0; $idx < count($chars); $idx++) {
if ($ch == $chars[$idx]) {
$match = true;
break;
}
}
if (!$match) {
return $sb;
}
$sb .= $ch;
$this->moveNext();
}
}
return $sb;
}
function skipMaxInTag ($chars, $maxChars) {
$sb = "";
$count = 0;
while (($ch = $this->iCurrentChar) !== -1 && $count++ < $maxChars) {
if ($ch == ">") {
return $sb;
} else {
$match = false;
for ($idx = 0; $idx < count($chars); $idx++) {
if ($ch == $chars[$idx]) {
$match = true;
break;
}
}
if (!$match) {
return $sb;
}
$sb .= $ch;
$this->moveNext();
}
}
return $sb;
}
function skipToInTag ($chars) {
$sb = "";
while (($ch = $this->iCurrentChar) !== -1) {
$match = $ch == ">";
if (!$match) {
for ($idx = 0; $idx < count($chars); $idx++) {
if ($ch == $chars[$idx]) {
$match = true;
break;
}
}
}
if ($match) {
return $sb;
}
$sb .= $ch;
$this->moveNext();
}
return $sb;
}
function skipToElement() {
$sb = "";
while (($ch = $this->iCurrentChar) !== -1) {
if ($ch == "<") {
return $sb;
}
$sb .= $ch;
$this->moveNext();
}
return $sb;
}
/**
* Returns text between current position and $needle,
* inclusive, or "" if not found. The current index is moved to a point
* after the location of $needle, or not moved at all
* if nothing is found.
*/
function skipToStringInTag ($needle) {
$pos = strpos ($this->iHtmlText, $needle, $this->iHtmlTextIndex);
if ($pos === false) {
return "";
}
$top = $pos + strlen($needle);
$retvalue = substr ($this->iHtmlText, $this->iHtmlTextIndex, $top - $this->iHtmlTextIndex);
$this->setTextIndex ($top);
return $retvalue;
}
}
function HtmlParser_ForFile ($fileName) {
return HtmlParser_ForURL($fileName);
}
function HtmlParser_ForURL ($url) {
$fp = fopen ($url, "r");
$content = "";
while (true) {
$data = fread ($fp, 8192);
if (strlen($data) == 0) {
break;
}
$content .= $data;
}
fclose ($fp);
return new HtmlParser ($content);
}
php?>