<?php

// Copyright (C) 2005 Ilya S. Lyubinskiy. All rights reserved.
// Technical support: http://www.php-development.ru/
//
// YOU MAY NOT
// (1) Remove or modify this copyright notice.
// (2) Distribute this code, any part or any modified version of it.
//     Instead, you may link to the homepage of this code:
//     http://www.php-development.ru/javascripts/smart-forms.php.
//
// YOU MAY
// (1) Use this code or any modified version of it on your website.
// (2) Use this code as part of another product.
//
// NO WARRANTY
// This code is provided "as is" without warranty of any kind, either
// expressed or implied, including, but not limited to, the implied warranties
// of merchantability and fitness for a particular purpose. You expressly
// acknowledge and agree that use of this code is at your own risk.


// Runtime configuration for the crawl: report every error class and show the
// messages in the page instead of writing them to the error log.
error_reporting(E_ALL);
ini_set('display_errors', 1);
ini_set('log_errors', 0);

// Remove the execution time limit for this request.
set_time_limit(0);


// ----- Parse HTML ------------------------------------------------------------

// Builds a regex fragment that matches $start, then captures everything up to
// (but not including) the first occurrence of the terminator $end1$end2.
//
// $start - Pattern for the opening delimiter.
// $end1  - First part of the terminator; it is placed inside a negated
//          character class, so it is meant to be a single character.
// $end2  - Remainder of the terminator (used in a negative lookahead).
//
// Returns the pattern text without regex delimiters or modifiers.
function enclose($start, $end1, $end2)
{
  // Body: any character other than $end1, or $end1 not followed by $end2.
  $body = '(?:[^' . $end1 . ']|' . $end1 . '(?!' . $end2 . '))*';

  return $start . '(' . $body . ')' . $end1 . $end2;
}

// Extracts the title, the plain text and the link targets of an HTML document.
//
// $html    - HTML source to analyse.
// $title   - In:  false to skip title extraction, any other value to request it.
//            Out: contents of the first <title> element, or '' if there is none.
// $text    - In:  false to skip text extraction, any other value to request it.
//            Out: $html with every tag replaced by a space and each run of
//                 whitespace or "&nbsp;" collapsed to a single space.
// $anchors - In:  false to skip link extraction, any other value to request it.
//            Out: unique href values of the <a> tags found ('' for an <a>
//                 without href); keys are those left by array_unique().
//
// HTML comments, <script> and <style> elements are blanked out before any
// extraction takes place.
function parse($html, &$title, &$text, &$anchors)
{
  $pstring1 = "'[^']*'";
  $pstring2 = '"[^"]*"';
  $pnstring = "[^'\">]";
  $pintag   = "(?:$pstring1|$pstring2|$pnstring)*";
  $pattrs   = "(?:\\s$pintag){0,1}";

  $pcomment = enclose("<!--", "-", "->");
  $pscript  = enclose("<script$pattrs>", "<", "\\/script>");
  $pstyle   = enclose("<style$pattrs>", "<", "\\/style>");
  $pexclude = "(?:$pcomment|$pscript|$pstyle)";

  $ptitle   = enclose("<title$pattrs>", "<", "\\/title>");
  $panchor  = "<a(?:\\s$pintag){0,1}>";
  $phref    = "href\\s*=[\\s'\"]*([^\\s'\">]*)";

  // preg_replace() returns null on a PCRE error (e.g. the backtrack limit on a
  // very large script block). Keep the unstripped HTML in that case instead of
  // silently losing the whole document.
  $stripped = preg_replace("/$pexclude/iX", " ", $html);
  if ($stripped !== null) $html = $stripped;

  if ($title !== false)
    $title = preg_match("/$ptitle/iX", $html, $match) ? $match[1] : '';

  if ($text !== false)
  {
    $text = preg_replace("/<$pintag>/iX",   " ", $html);
    $text = preg_replace("/\\s+|&nbsp;/iX", " ", $text);
  }

  if ($anchors !== false)
  {
    preg_match_all("/$panchor/iX", $html, $anchors);
    $anchors = $anchors[0];

    // foreach instead of each(): each() was removed in PHP 8.0.
    foreach ($anchors as $i => $tag)
      $anchors[$i] = preg_match("/$phref/iX", $tag, $match) ? $match[1] : '';

    $anchors = array_unique($anchors);
  }
}


// ----- URL Functions ---------------------------------------------------------


// ----- Parse URL -----

// Wrapper around parse_url() that keeps warnings quiet: the error reporting
// level is lowered to E_ERROR | E_PARSE for the duration of the call and then
// restored. Returns whatever parse_url() returns for $url.
function url_parse($url)
{
  $saved_level = error_reporting(E_ERROR | E_PARSE);
  $components  = parse_url($url);
  error_reporting($saved_level);

  return $components;
}

// ----- Extract Scheme -----

// Returns the scheme component of $url. When the URL cannot be parsed or has
// no scheme, the fallback $scheme (default 'http') is returned instead.
function url_scheme($url, $scheme = 'http')
{
  $parts = url_parse($url);

  if ($parts && isset($parts['scheme']))
    return $parts['scheme'];

  return $scheme;
}

// ----- Extract Host -----

define('URL_HOST_APPEND', 1);
define('URL_HOST_STRIP',  2);

// Returns the host component of $url.
//
// $url   - URL to inspect.
// $lower - When true (default) the host is lower-cased.
// $www   - URL_HOST_APPEND: prefix "www." unless the host already starts with it.
//          URL_HOST_STRIP:  remove a leading "www.".
//          Any other value: leave the host as it is.
//
// Returns '' when the URL cannot be parsed or contains no host (for example a
// relative URL), regardless of $www.
function url_host($url, $lower = true, $www = 0)
{
  $parts = url_parse($url);

  // A missing host used to raise an undefined-index notice and, with
  // URL_HOST_APPEND, produced the meaningless host "www.".
  if (!$parts || !isset($parts['host'])) return '';

  $host = $lower ? strtolower($parts['host']) : $parts['host'];
  if ($www == URL_HOST_APPEND && strpos($host, 'www.') !== 0) return 'www.' . $host;
  if ($www == URL_HOST_STRIP  && strpos($host, 'www.') === 0) return substr($host, 4);
  return $host;
}

// ----- Extract Path -----

// Returns the directory part of the path of $url, without leading or trailing
// slash. A final path segment that is empty or contains a dot is treated as a
// file name and dropped. Returns '' when the URL cannot be parsed.
function url_path($url)
{
  $parts = url_parse($url);
  if (!$parts) return '';

  $segments = Array();
  if (isset($parts['path']))
    $segments = explode('/', $parts['path']);

  // Drop the empty segment produced by a leading slash.
  if (reset($segments) === '')
    array_shift($segments);

  // Drop the trailing segment when it is empty or looks like a file name.
  $last = end($segments);
  if ($last === '' || strpos($last, '.') !== false)
    array_pop($segments);

  return implode('/', $segments);
}

// ----- Extract Filename -----

// Returns the file name of $url: the last path segment, provided it contains
// a dot; otherwise ''.
//
// $url     - URL to inspect.
// $convert - Map of regular expression => replacement; each pair is applied
//            to the file name with preg_replace(), in array order.
//
// Returns '' when the URL cannot be parsed.
function url_file($url, $convert = Array())
{
  if(!($url = url_parse($url))) return '';

  $file = '';
  if (isset($url['path']))
  {
    // end() takes its argument by reference, so the exploded array has to be
    // stored in a variable first.
    $segments = explode('/', $url['path']);
    $file     = end($segments);
  }

  $file = (strpos($file, '.') !== false) ? $file : '';
  foreach ($convert as $i => $x) $file = preg_replace($i, $x, $file);
  return $file;
}


// ----- Extract Extension -----

// Returns the extension of the file name in $url: the text after the last dot
// of the last path segment, or '' when that segment contains no dot.
//
// $url     - URL to inspect.
// $convert - Map of regular expression => replacement; each pair is applied
//            to the extension with preg_replace(), in array order.
//
// Returns '' when the URL cannot be parsed.
function url_ext($url, $convert = Array())
{
  if(!($url = url_parse($url))) return '';

  $file = '';
  if (isset($url['path']))
  {
    // end() takes its argument by reference, so each exploded array has to be
    // stored in a variable first.
    $segments = explode('/', $url['path']);
    $file     = end($segments);
  }

  $ext = '';
  if (strpos($file, '.') !== false)
  {
    $pieces = explode('.', $file);
    $ext    = end($pieces);
  }

  foreach ($convert as $i => $x) $ext = preg_replace($i, $x, $ext);
  return $ext;
}

// ----- Extract Query -----

define('URL_QUERY_NOESCAPE', 0);
define('URL_QUERY_ESCAPE'  , 1);

// Returns the query string of $url with selected parameters removed.
//
// $url     - URL to inspect.
// $escape  - Truthy (URL_QUERY_ESCAPE): join the parameters with "&amp;";
//            otherwise (URL_QUERY_NOESCAPE) join them with "&".
// $exclude - Parameter names to drop from the query.
//
// The query is split on both "&" and "&amp;". Returns '' when the URL cannot
// be parsed or has no query component.
function url_query($url, $escape = 0, $exclude = Array())
{
  $parts = url_parse($url);
  if (!$parts || !isset($parts['query'])) return '';

  $kept = Array();
  foreach (preg_split('/(&(?!amp;)|&amp;)/', $parts['query']) as $pair)
  {
    // The parameter name is everything before the first "=".
    $fields = explode('=', $pair);
    if (!in_array($fields[0], $exclude)) $kept[] = $pair;
  }

  $glue = $escape ? '&amp;' : '&';
  return implode($glue, $kept);
}

// ----- Concat -----

// Resolves the relative URL $rel against the absolute URL $base.
//
// $base - Absolute URL; its scheme, host and directory path are used.
// $rel  - Relative reference. A leading "/" makes it relative to the host
//         root; anything else is appended to the directory of $base.
//
// Returns the combined URL. "." and ".." segments are not resolved here.
function url_concat($base, $rel)
{
  $scheme = url_scheme($base);
  $host   = url_host  ($base);
  $path   = url_path  ($base);

  // $rel[0] replaces the curly-brace offset $rel{0}, which is a parse error
  // since PHP 8.0. The emptiness check avoids reading an offset of ''.
  if ($rel !== '' && $rel[0] == '/')
    return "$scheme://$host$rel";

  if ($path === '')
    return "$scheme://$host/$rel";

  return "$scheme://$host/$path/$rel";
}

// ----- Normalize -----

// Rebuilds $url in the canonical form scheme://host/path/file?query.
//
// $url     - URL to normalize.
// $scheme  - Scheme to use when the URL has none.
// $www     - URL_HOST_APPEND / URL_HOST_STRIP, passed on to url_host().
// $convert - File name conversions, passed on to url_file().
// $escape  - Query separator mode, passed on to url_query().
// $exclude - Query parameter names to drop, passed on to url_query().
//
// Returns '' when the scheme or the host comes out empty.
function url_normalize($url,
                       $scheme  = 'http',
                       $www     = 0,
                       $convert = Array(),
                       $escape  = 0,
                       $exclude = Array())
{
  $scheme = url_scheme($url, $scheme);
  $host   = url_host  ($url, true, $www);
  $path   = url_path  ($url);
  $file   = url_file  ($url, $convert);
  $query  = url_query ($url, $escape, $exclude);

  if ($scheme === '' || $host === '') return '';

  $result = "$scheme://$host/";
  if ($path !== '') $result .= "$path/";
  $result .= $file;
  if ($query) $result .= "?$query";

  return $result;
}


// ----- Index Website ---------------------------------------------------------

// INPUT:
//
// $roots      - Links are followed only from URLs that start with a string
//               from the $roots array.
// $urls       - Array containing URLs from which to start indexing
//               (passed by reference, see OUTPUT).
// $max        - Maximum number of URLs to collect: no new URL is added once
//               $urls holds this many entries.
// $www        - INDEX_HOST_APPEND = append "www.", INDEX_HOST_STRIP = strip "www.".
// $convert    - Array of file conversions (regular expression => replacement).
// $exclude    - Array of names to be excluded from query.
// $titles     - Empty array if titles are needed, false otherwise.
// $texts      - Empty array if texts  are needed, false otherwise.
// $ext_parse  - True to also fetch URLs outside $roots for their title/text
//               (only when titles or texts are requested); links found on
//               such pages are never followed.
// $extensions - Array of webpage extensions; only URLs whose extension is in
//               this array are fetched.
//
// OUTPUT:
//
// $urls    - Array of collected URLs, re-keyed from 0. Entries that normalize
//            to an empty string are removed, which leaves gaps in the keys.
// $titles  - Array of titles or False, keyed like $urls.
// $texts   - Array of texts  or False, keyed like $urls.
// return   - Array("time" => elapsed seconds, "parsed" => pages fetched).
//
// SAMPLE CALL
//
// $roots  = Array('http://domain.com/');
// $urls   = Array('http://domain.com/');
// $titles = Array();
// $texts  = Array();
// $result = index($roots, $urls,
//                 1024,
//                 INDEX_HOST_STRIP,
//                 Array('/^index.\\w+$/' => ''),
//                 Array('id'),
//                 $titles, $texts,
//                 false,
//                 Array('', 'php', 'html'));
//
// Index only URLs from domain "domain.com".
// Start indexing from URL "http://domain.com/".
// Collect up to 1024 URLs.
// Strip "www." from domain names.
// Remove "index.*" from URLs.
// Remove "id" key from queries.
// Do not fetch pages outside "domain.com".
// Fetch only URLs without extension or ending in ".php" / ".html".
// After execution:
// $urls   contains array of indexed URLs.
// $titles contains array of titles (NOT for all indexed URLs).
// $texts  contains array of texts  (NOT for all indexed URLs).


define('INDEX_HOST_APPEND', 1);
define('INDEX_HOST_STRIP',  2);

function index($roots, &$urls, $max, $www, $convert, $exclude,
               &$titles, &$texts, $ext_parse, $extensions)
{
  $time   = microtime(true);
  $parsed = 0;

  foreach ($urls as $i => $url)
    $urls[$i] = url_normalize($url, 'http', $www, $convert, URL_QUERY_NOESCAPE, $exclude);

  // The loop below addresses entries by position, so make the keys 0..n-1.
  $urls = array_values($urls);

  // Number of entries removed so far. count($urls) shrinks on every unset(),
  // so the loop bound adds the removed entries back; without this the last
  // URLs of the list would never be visited.
  $removed = 0;

  for ($ind = 0; $ind < count($urls) + $removed; $ind++)
  {
    if (trim($urls[$ind]) === '')
    {
      unset($urls[$ind]);
      $removed++;
      continue;
    }

    // ----- Check URL -----

    $in_root = false;
    foreach ($roots as $root)
      if (strpos($urls[$ind], $root) === 0)
      {
        $in_root = true;
        break;
      }

    // Pages outside the roots are fetched only for their title/text.
    if (!$in_root)
    {
      if (!$ext_parse) continue;
      if ($titles === false && $texts === false) continue;
    }

    if (!in_array(url_ext($urls[$ind]), $extensions)) continue;

    // ----- Get Contents -----

    // Keep download warnings (unreachable host, HTTP errors) out of the output.
    $error_reporting = error_reporting(E_ERROR | E_PARSE);
    $html = file_get_contents($urls[$ind]);
    error_reporting($error_reporting);

    if ($html === false) continue;

    // ----- Parse URL -----

    $parsed++;

    // parse() skips every output that is passed in as false.
    $title   = $titles !== false;
    $text    = $texts  !== false;
    $anchors = Array();
    parse($html, $title, $text, $anchors);

    if ($titles !== false) $titles[$ind] = $title;
    if ($texts  !== false) $texts [$ind] = $text;

    // ----- Extract Anchors -----

    if (!$in_root || $max < count($urls)) continue;

    foreach ($anchors as $i => $x)
    {
      // Drop the fragment, then skip empty links and scheme-only links such
      // as "mailto:" or "javascript:" (a scheme not followed by "//").
      $x = preg_replace("/#.*/X", "", $x);
      if ($x == '' || preg_match("/^(\\w)+:(?!\/\/)/X", $x)) continue;

      // Anything without "scheme://" is relative to the current page.
      if (!preg_match("/^(\\w)+:\/\//X", $x)) $x = url_concat($urls[$ind], $x);
      $x = url_normalize($x, 'http', $www, $convert, URL_QUERY_NOESCAPE, $exclude);
      if (!in_array($x, $urls) && (count($urls) < $max)) $urls[] = $x;
    }
  }

  return Array("time" => microtime(true)-$time, "parsed" => $parsed);
}


// ----- Separate Links --------------------------------------------------------

// INPUT:
//
// $roots      - URL is considered internal if it starts with a string from
//               $roots array.
// $urls       - Array containing URLs.
// $extensions - Array of webpage extensions.
//
// OUTPUT:
//
// $int_pages - Array of internal webpages
// $int_loads - Array of internal downloads
// $ext_pages - Array of external webpages
// $ext_loads - Array of external downloads
//
// Each URL keeps its key from $urls in the output array it is sorted into.
// URLs that are empty after trim() are ignored.

function separate($roots, $urls,
                  &$int_pages, &$int_loads, &$ext_pages, &$ext_loads,
                  $extensions)
{
  foreach ($urls as $key => $link)
  {
    if (trim($link) === '') continue;

    // Internal = starts with at least one of the root prefixes.
    $internal = false;
    foreach ($roots as $prefix)
    {
      if (strpos($link, $prefix) === 0)
      {
        $internal = true;
        break;
      }
    }

    // Webpage = extension is listed in $extensions; anything else is a download.
    $is_page = in_array(url_ext($link), $extensions);

    if ($internal)
    {
      if ($is_page) $int_pages[$key] = $link;
      else          $int_loads[$key] = $link;
    }
    else
    {
      if ($is_page) $ext_pages[$key] = $link;
      else          $ext_loads[$key] = $link;
    }
  }
}


// ----- Use It ----------------------------------------------------------------

// Read the form fields defensively: a field that is missing or is not a
// string becomes ''. Reading $_POST directly raised undefined-index notices,
// which are displayed because display_errors is enabled above.
$form = Array();
foreach (Array('url', 'maxpages', 'www', 'index', 'index_append', 'ses', 'ext') as $field)
  $form[$field] = (isset($_POST[$field]) && is_string($_POST[$field])) ? $_POST[$field] : '';

// External pages are never fetched when the target is php-development.ru.
$target_host = url_host($form['url']);
if ($target_host ==     'php-development.ru' ||
    $target_host == 'www.php-development.ru')
  unset($_POST['external']);

// A URL counts as internal if it starts with the submitted URL or with its
// normalized form with or without "www.".
$roots    = Array($form['url'],
                  url_normalize($form['url'], 'http', URL_HOST_APPEND),
                  url_normalize($form['url'], 'http', URL_HOST_STRIP));
$urls     = Array($form['url']);

// Cap the crawl at 32 pages when served from localhost or php-development.ru.
$maxpages    = (int)$form['maxpages'];
$server_name = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : '';
if (($server_name == 'localhost') ||
    ($server_name == 'php-development.ru') ||
    ($server_name == 'www.php-development.ru'))
  if ($maxpages > 32) $maxpages = 32;

// "www." handling requested by the form.
$host     = 0;
if ($form['www'] == "strip" ) $host = INDEX_HOST_STRIP;
if ($form['www'] == "append") $host = INDEX_HOST_APPEND;

// Index file handling: strip "index.*" or append a file name to bare
// directory URLs.
$index    = Array();
if ($form['index'] == "strip" ) $index = Array('/^index.\\w+$/' => '');
if ($form['index'] == "append") $index = Array('/^$/' => $form['index_append']);

// Whitespace-separated query parameter names to remove from URLs.
$exclude  = preg_split('/\\s+/', $form['ses']);

// Titles are collected, page texts are not.
$titles   = Array();
$texts    = false;

$external = isset($_POST['external']);

// Whitespace-separated webpage extensions; '' also accepts URLs without one.
$ext      = preg_split('/\\s+/', $form['ext']);
$ext[]    = '';

$result  = index($roots, $urls, $maxpages, $host, $index, $exclude, $titles, $texts, $external, $ext);

$int_pages = Array();
$int_loads = Array();
$ext_pages = Array();
$ext_loads = Array();
separate($roots, $urls, $int_pages, $int_loads, $ext_pages, $ext_loads, $ext);

?>

<?php
// ----- HTML -----

// Everything printed below comes from the request or from crawled pages, so
// it is escaped before it reaches the browser. ENT_SUBSTITUTE (PHP 5.4+) keeps
// text with invalid byte sequences from being turned into an empty string.
$esc_flags = defined('ENT_SUBSTITUTE') ? (ENT_QUOTES | ENT_SUBSTITUTE) : ENT_QUOTES;

// Submitted root URL, or '' when the field is missing or not a string.
$root_url  = (isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : '';

// Renders one result entry: a label line followed by a small link to $url.
// $label and $url are plain text; both are HTML-escaped here with $flags.
function index_link_html($label, $url, $flags)
{
  $safe_url = htmlspecialchars($url, $flags);

  return "<div class=\"link\">" .
         htmlspecialchars($label, $flags) .
         "<br />" .
         "<small><a href=\"" . $safe_url . "\">" . $safe_url . "</a></small>" .
         "</div>";
}
?>

<!DOCTYPE html PUBLIC
          "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>
<head>

<title>Website Index</title>

<style type="text/css">

table { border-collapse: collapse; }

td    { padding: 0px 5px 1px 0px; }

div.link
{
  padding: 0px;
  margin: 0px 0px 6px 0px;
  line-height: 0.9em;
}

a
{
  text-decoration: none;
  color: #007500;
}

</style>

</head>
<body>

<h3>Site Index</h3>

<table>

<tr>
  <td>Root&nbsp;location:</td>
  <td><?=htmlspecialchars($root_url, $esc_flags);?></td>
</tr>
<tr>
  <td>Execution&nbsp;time:</td>
  <td><?=number_format($result['time'], 2, '.', '');?> secs</td>
</tr>
<tr>
  <td>URLs&nbsp;parsed:</td>
  <td><?=$result['parsed'];?></td>
</tr>
<tr>
  <td>URLs&nbsp;extracted:</td>
  <td><?=count($urls);?></td>
</tr>
</table>

<?php
// ----- Internal Webpages -----
?>

<h4>Internal Webpages</h4>

<?php

asort($int_pages);
foreach ($int_pages as $i => $x)
{
  // Titles come from the crawled HTML and may already contain entities:
  // decode them first so that escaping does not double-encode them.
  $label = (isset($titles[$i]) && trim($titles[$i]) !== '')
         ? html_entity_decode($titles[$i], ENT_QUOTES)
         : "Untitled Document";
  $int_pages[$i] = index_link_html($label, $x, $esc_flags);
}
echo implode('', $int_pages);

?>

<?php
// ----- Internal Downloads -----
?>

<h4>Internal Downloads</h4>

<?php

asort($int_loads);
foreach ($int_loads as $i => $x)
  $int_loads[$i] = index_link_html(url_file($x), $x, $esc_flags);
echo implode('', $int_loads);

?>

<?php
// ----- External Webpages -----
?>

<h4>External Webpages</h4>

<?php

asort($ext_pages);
foreach ($ext_pages as $i => $x)
{
  // Same entity handling as for internal page titles.
  $label = (isset($titles[$i]) && trim($titles[$i]) !== '')
         ? html_entity_decode($titles[$i], ENT_QUOTES)
         : "Untitled Document";
  $ext_pages[$i] = index_link_html($label, $x, $esc_flags);
}
echo implode('', $ext_pages);

?>

<?php
// ----- External Downloads -----
?>

<h4>External Downloads</h4>

<?php

asort($ext_loads);
foreach ($ext_loads as $i => $x)
  $ext_loads[$i] = index_link_html(url_file($x), $x, $esc_flags);
echo implode('', $ext_loads);

?>

</body>
</html>
