时时商务社区

标题: php获取网页标题和内容函数(不包含html标签) [打印本页]

作者: qz234 时间: 2018-2-14 06:00

         代码如下:
/**
 * Fetch a web page with cURL and extract its metadata and tag-free text.
 *
 * The returned array always contains the same keys:
 *   - 'content_type': Content-Type response header ('' if none was sent)
 *   - 'charset'     : charset from the header, else from a <meta> tag
 *   - 'title'       : contents of the <title> element
 *   - 'description' : content of <meta name="description">
 *   - 'keywords'    : content of <meta name="keywords">
 *   - 'body'        : <body> text with markup removed, passed through addslashes()
 *   - 'httpcode'    : HTTP status code, or 505 when the transfer itself failed
 *   - 'all'         : whole document with markup removed, passed through addslashes()
 *
 * When the transfer fails, the status is not 200, or the response is not
 * text/html, the text fields are left empty and the array is returned early.
 *
 * NOTE(review): 'body' and 'all' are run through addslashes() exactly as the
 * original did. addslashes() is not a safe way to build SQL; callers that
 * store these values should use parameterised queries.
 *
 * @param string $url Absolute URL of the page to fetch.
 * @return array Page information as described above.
 */
function getPageContent($url) {
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
        'all'          => '',
    );

    $ch = curl_init();
    if ($ch === false) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate verification is switched off, so HTTPS
    // responses are not authenticated. Kept for backward compatibility.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_URL, $url);

    // Collect everything needed from the handle first, so it can be closed
    // once instead of leaking on the early-return paths.
    $store       = curl_exec($ch);
    $curlError   = curl_error($ch);
    $httpCode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($store === false || $curlError !== '') {
        $pageinfo['httpcode'] = 505;  // gateway error
        echo 'Curl error: ' . $curlError . "\n";
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpCode;
    $pageinfo['content_type'] = $contentType;
    if (intval($httpCode) != 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Prefer the charset announced in the Content-Type header.
    if (preg_match('/charset=([^\s;]+)/i', $contentType, $matches)) {
        $pageinfo['charset'] = trim($matches[1], " \t\"'");
    }

    // Drop the parts of the document that never carry readable text:
    // scripts, <link> tags, HTML comments and style sheets.
    $cleanup = array(
        '@<script\b[^>]*>.*?</script>@si',
        '@<link\b[^>]*>@si',
        '@<!--.*?-->@s',
        '@<style\b[^>]*>.*?</style>@si',
    );
    $cleaned = preg_replace($cleanup, '', $store);
    if ($cleaned !== null) {
        // preg_replace() yields null on a PCRE error; keep the raw page then.
        $store = $cleaned;
    }
    // Remove the full-width (CJK) space U+3000, written as its UTF-8 bytes.
    $store = str_replace("\xE3\x80\x80", '', $store);

    // Fall back to a <meta charset=...> / <meta ... content="...; charset=...">.
    if ($pageinfo['charset'] == ''
        && preg_match('@<meta\b[^>]*\bcharset\s*=\s*["\']?([\w\-]+)@i', $store, $matches)
    ) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Double quotes are removed from both values, as the original did.
    $pageinfo['description'] = str_replace('"', '', getPageMetaContent($store, 'description'));
    $pageinfo['keywords']    = str_replace('"', '', getPageMetaContent($store, 'keywords'));

    if (preg_match('@<title\b[^>]*>(.*?)</title>@si', $store, $matches)) {
        $pageinfo['title'] = trim($matches[1]);
    }
    if (preg_match('@<body\b[^>]*>(.*?)</body>@si', $store, $matches)) {
        $pageinfo['body'] = addslashes((string) replaceHtmlAndJs($matches[1]));
    }
    $pageinfo['all'] = addslashes((string) replaceHtmlAndJs($store));

    return $pageinfo;
}

/**
 * Return the content attribute of the first <meta name="$name"> tag.
 *
 * The name and content attributes may appear in either order and may be
 * double-quoted, single-quoted or unquoted.
 *
 * @param string $html HTML source to search.
 * @param string $name Value of the meta tag's name attribute (case-insensitive).
 * @return string Trimmed content value, or '' when the tag or attribute is absent.
 */
function getPageMetaContent($html, $name)
{
    $tagPattern = '@<meta\b[^>]*\bname\s*=\s*["\']?' . preg_quote($name, '@')
        . '["\']?(?=[\s/>])[^>]*>@i';
    if (!preg_match($tagPattern, $html, $tag)) {
        return '';
    }

    // Try double-quoted, single-quoted and unquoted attribute values in turn.
    $valuePatterns = array(
        '@\bcontent\s*=\s*"([^"]*)"@i',
        '@\bcontent\s*=\s*\'([^\']*)\'@i',
        '@\bcontent\s*=\s*([^\s"\'>]+)@i',
    );
    foreach ($valuePatterns as $valuePattern) {
        if (preg_match($valuePattern, $tag[0], $value)) {
            return trim($value[1]);
        }
    }

    return '';
}
/**
 * Strip all HTML markup and JavaScript from a document fragment.
 *
 * Applied in order: <script> elements are removed together with their
 * contents, every remaining tag (including comments and doctype
 * declarations) is removed, all whitespace is removed, and named HTML
 * entities such as &nbsp; are removed.
 *
 * @param string $document HTML source.
 * @return string Remaining text, or '' if the input is blank or PCRE fails.
 */
function replaceHtmlAndJs($document)
{
    $document = trim($document);
    if (strlen($document) <= 0) {
        return $document;
    }

    $search = array(
        '~<script[^>]*?>.*?</script>~si',  // JavaScript blocks
        '~<[/!]*?[^<>]*?>~s',              // any other HTML tag
        '~\s+~',                           // whitespace (not literal "+")
        '~&(\w+);~i',                      // named HTML entities
    );
    $result = preg_replace($search, '', $document);

    // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
    // return an empty string so callers always receive a string.
    return $result === null ? '' : $result;
}
使用例子
代码如下:
// Usage example: the URL must be a quoted string that includes its scheme.
$a = getPageContent('http://www.ttphp.com');
print_r($a);


您可能感兴趣的文章:
PHP html标签正则替换并可自定义正则规则
PHP 修复未正常关闭的HTML标签实现代码(支持嵌套和就近闭合)
PHP实现HTML标签自动补全代码
php使HTML标签自动补全闭合函数代码
php中将html中的br换行符转换为文本输入中的换行符
PHP删除HTMl标签的三种解决方法
php去除HTML标签实例
php 批量替换html标签的实例代码
PHP中HTML标签过滤技巧
php过滤HTML标签、属性等正则表达式汇总
php实现过滤表单提交中html标签的方法
PHP将HTML转换成文本的实现代码
PHP转换文本框内容为HTML格式的方法
php自定义函数转换html标签示例


欢迎光临时时商务社区 (http://bbs.4435.cn/) Powered by Discuz! X3.2