方法一:使用 cURL(推荐,功能最强大)
<?php
/**
 * Fetch a complete web page with cURL.
 *
 * @param string $url     Target URL
 * @param array  $options Optional CURLOPT_* overrides (constant => value)
 * @return array On success: ['success' => true, 'http_code' => int,
 *               'header' => string, 'body' => string, 'url' => string].
 *               On failure: ['success' => false, 'error' => string].
 */
function fetchWebpageWithCurl($url, $options = []) {
    $ch = curl_init();

    // Sensible defaults; anything passed in $options overrides them.
    $defaultOptions = [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,   // return the body instead of printing it
        CURLOPT_FOLLOWLOCATION => true,   // follow redirects
        CURLOPT_MAXREDIRS => 5,           // redirect limit
        CURLOPT_TIMEOUT => 30,            // timeout in seconds
        CURLOPT_SSL_VERIFYPEER => false,  // skip SSL verification (testing only; enable in production)
        CURLOPT_SSL_VERIFYHOST => false,  // skip SSL host verification
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        CURLOPT_HEADER => true,           // include headers in the response (needed for the split below)
        CURLOPT_ENCODING => '',           // accept any encoding curl supports
    ];

    // array_replace() keeps the integer CURLOPT_* keys intact
    // (array_merge() would renumber integer keys).
    $finalOptions = array_replace($defaultOptions, $options);
    curl_setopt_array($ch, $finalOptions);

    $response = curl_exec($ch);

    // curl_exec() returns false on failure; check the return value explicitly
    // in addition to the error code so no failure mode slips through.
    if ($response === false || curl_errno($ch) !== 0) {
        $error = curl_error($ch) ?: 'cURL request failed';
        curl_close($ch);
        return [
            'success' => false,
            'error' => $error
        ];
    }

    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Split the raw response into the header block and the body.
    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $header = substr($response, 0, $headerSize);
    $body = substr($response, $headerSize);
    curl_close($ch);

    return [
        'success' => true,
        'http_code' => $httpCode,
        'header' => $header,
        'body' => $body,
        'url' => $url
    ];
}
// Example: fetch https://example.com and save the page to disk.
$url = 'https://example.com';
$result = fetchWebpageWithCurl($url);

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    $body = $result['body'];
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    echo "网页内容长度:" . strlen($body) . " 字节\n";
    echo "网页内容预览:\n" . substr($body, 0, 500) . "...\n";
    // Persist the downloaded markup next to the script.
    file_put_contents('downloaded_page.html', $body);
    echo "网页已保存到 downloaded_page.html\n";
}
?>
方法二:使用 file_get_contents(简单但功能有限)
<?php
/**
 * Fetch a web page with file_get_contents.
 *
 * @param string $url Target URL
 * @return array On success: ['success' => true, 'http_code' => int,
 *               'headers' => array, 'body' => string, 'url' => string].
 *               On failure: ['success' => false, 'error' => string].
 */
function fetchWebpageWithFileGetContents($url) {
    // Stream context: browser-like request headers, 30 s timeout;
    // ignore_errors keeps the body readable even on 4xx/5xx responses.
    // SSL verification is disabled for testing only — enable it in production.
    $options = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Accept-Encoding: gzip, deflate',
                'Connection: keep-alive',
            ],
            'timeout' => 30,
            'ignore_errors' => true
        ],
        'ssl' => [
            'verify_peer' => false,
            'verify_peer_name' => false
        ]
    ];
    $context = stream_context_create($options);

    try {
        $content = file_get_contents($url, false, $context);
        if ($content === false) {
            return [
                'success' => false,
                'error' => '无法获取网页内容'
            ];
        }

        // $http_response_header is populated by the file_get_contents() call
        // above with every response header line (including each redirect hop),
        // so no second request via get_headers() is needed. The original code
        // issued a duplicate request and read the FIRST status line, which on
        // redirects is the 3xx of the initial hop rather than the final code.
        $rawHeaders = isset($http_response_header) ? $http_response_header : [];

        $httpCode = 0;
        $headers = [];
        foreach ($rawHeaders as $line) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', $line, $m)) {
                // New status line => a new hop; keep only the newest block so
                // the returned headers describe the final response.
                $httpCode = (int) $m[1];
                $headers = [$line]; // index 0 holds the status line, like get_headers()
            } elseif (strpos($line, ':') !== false) {
                list($name, $value) = explode(':', $line, 2);
                $headers[trim($name)] = trim($value);
            }
        }

        return [
            'success' => true,
            'http_code' => $httpCode,
            'headers' => $headers,
            'body' => $content,
            'url' => $url
        ];
    } catch (Exception $e) {
        return [
            'success' => false,
            'error' => $e->getMessage()
        ];
    }
}
// Example: same fetch performed with the file_get_contents variant.
$url = 'https://example.com';
$result = fetchWebpageWithFileGetContents($url);

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    // Persist the downloaded markup next to the script.
    file_put_contents('downloaded_page_fg.html', $result['body']);
    echo "网页已保存到 downloaded_page_fg.html\n";
}
?>
方法三:增强版cURL抓取(支持更多功能)
<?php
/**
 * Enhanced web page fetcher built on cURL.
 *
 * Supports custom headers, cookies, proxies and HTTP authentication through a
 * fluent interface. A single cURL handle is created in the constructor and
 * reused for every fetch() call, so configured options persist across requests.
 */
class WebpageFetcher {
    private $ch;            // reusable cURL handle
    private $cookies = [];  // accumulated cookies (name => value)
    private $headers = [];  // accumulated raw header lines ("Name: value")

    /**
     * Create the cURL handle and apply the default transfer options.
     */
    public function __construct() {
        $this->ch = curl_init();
        $this->setDefaultOptions();
    }

    /**
     * Apply the default transfer options to the handle.
     */
    private function setDefaultOptions() {
        $defaultOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_SSL_VERIFYPEER => false, // testing only — enable in production
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            CURLOPT_HEADER => true,          // needed so fetch() can split header/body
            CURLOPT_ENCODING => '',          // accept any encoding curl supports
            CURLOPT_VERBOSE => false,
        ];
        curl_setopt_array($this->ch, $defaultOptions);
    }

    /**
     * Add custom request header lines ("Name: value"); merged with any set before.
     *
     * @param array $headers
     * @return $this
     */
    public function setHeaders($headers) {
        $this->headers = array_merge($this->headers, $headers);
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $this->headers);
        return $this;
    }

    /**
     * Add cookies (name => value) to send with every request.
     *
     * @param array $cookies
     * @return $this
     */
    public function setCookies($cookies) {
        $this->cookies = array_merge($this->cookies, $cookies);
        $pairs = [];
        foreach ($this->cookies as $key => $value) {
            $pairs[] = "$key=$value";
        }
        // Join with "; " — the original appended a dangling "; " trailer.
        curl_setopt($this->ch, CURLOPT_COOKIE, implode('; ', $pairs));
        return $this;
    }

    /**
     * Route requests through a proxy.
     *
     * @param string $proxy     host:port
     * @param int    $proxyType CURLPROXY_* constant
     * @return $this
     */
    public function setProxy($proxy, $proxyType = CURLPROXY_HTTP) {
        curl_setopt($this->ch, CURLOPT_PROXY, $proxy);
        curl_setopt($this->ch, CURLOPT_PROXYTYPE, $proxyType);
        return $this;
    }

    /**
     * Set HTTP authentication credentials.
     *
     * @param string $username
     * @param string $password
     * @param int    $type CURLAUTH_* constant
     * @return $this
     */
    public function setAuth($username, $password, $type = CURLAUTH_BASIC) {
        curl_setopt($this->ch, CURLOPT_HTTPAUTH, $type);
        curl_setopt($this->ch, CURLOPT_USERPWD, "$username:$password");
        return $this;
    }

    /**
     * Fetch a URL with the configured options.
     *
     * @param string $url
     * @return array On success: success/http_code/headers/body/url/info.
     *               On failure: ['success' => false, 'error' => string].
     */
    public function fetch($url) {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        $response = curl_exec($this->ch);

        // curl_exec() returns false on failure; check the return value
        // explicitly in addition to the error code.
        if ($response === false || curl_errno($this->ch) !== 0) {
            return [
                'success' => false,
                'error' => curl_error($this->ch) ?: 'cURL request failed'
            ];
        }

        $httpCode = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
        $headerSize = curl_getinfo($this->ch, CURLINFO_HEADER_SIZE);
        $header = substr($response, 0, $headerSize);
        $body = substr($response, $headerSize);

        // Parse the response header block into name => value pairs.
        $headers = $this->parseHeaders($header);

        return [
            'success' => true,
            'http_code' => $httpCode,
            'headers' => $headers,
            'body' => $body,
            'url' => $url,
            'info' => curl_getinfo($this->ch)
        ];
    }

    /**
     * Parse a raw response header string into name => value pairs.
     *
     * With redirects cURL concatenates one header block per hop; only the
     * final block describes the body that was returned, so any earlier hops
     * (and their Location/Set-Cookie lines) are discarded. The original
     * implementation merged every hop's headers together.
     *
     * @param string $headerString
     * @return array
     */
    private function parseHeaders($headerString) {
        $blocks = array_filter(array_map('trim', explode("\r\n\r\n", trim($headerString))));
        $last = end($blocks);

        $headers = [];
        if ($last !== false) {
            foreach (preg_split('/\r?\n/', $last) as $line) {
                // Status lines ("HTTP/1.1 200 OK") contain no colon and are skipped.
                if (strpos($line, ':') !== false) {
                    list($key, $value) = explode(':', $line, 2);
                    $headers[trim($key)] = trim($value);
                }
            }
        }
        return $headers;
    }

    /**
     * Release the cURL handle (no-op on PHP 8+, where handles are objects).
     */
    public function __destruct() {
        if ($this->ch) {
            curl_close($this->ch);
        }
    }

    /**
     * Static convenience method: fetch a URL with default options.
     *
     * @param string $url
     * @return array Same shape as fetch().
     */
    public static function quickFetch($url) {
        $fetcher = new self();
        return $fetcher->fetch($url);
    }
}
// Example: configure a fetcher with custom headers and cookies, then fetch.
$fetcher = new WebpageFetcher();
$fetcher
    ->setHeaders([
        'Accept-Language: zh-CN,zh;q=0.9',
        'Cache-Control: no-cache'
    ])
    ->setCookies([
        'session_id' => '123456',
        'user_pref' => 'dark_mode'
    ]);

$result = $fetcher->fetch('https://example.com');

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    echo "Content-Type: " . ($result['headers']['Content-Type'] ?? '未知') . "\n";
    echo "网页大小:" . strlen($result['body']) . " 字节\n";
    // Save under a timestamped name so repeated runs don't overwrite.
    $filename = 'webpage_' . date('Ymd_His') . '.html';
    file_put_contents($filename, $result['body']);
    echo "网页已保存到: $filename\n";
    // Pull the <title> out of the markup, if one is present.
    if (preg_match('/<title>(.*?)<\/title>/i', $result['body'], $matches)) {
        echo "网页标题: " . $matches[1] . "\n";
    }
}

// One-off fetch via the static helper.
$quickResult = WebpageFetcher::quickFetch('https://example.com');
?>
使用说明
选择合适的方案:
- 如果需要处理复杂的网页、需要设置代理、处理Cookie等,使用方法三(增强版)
- 如果只需要简单抓取,使用方法一(cURL)
- 如果环境不支持cURL,可以使用方法二(file_get_contents)
注意事项:
- 确保PHP开启了相应的扩展(cURL、openssl等)
- 遵守网站的robots.txt规则
- 不要过于频繁地抓取同一网站
- 注意处理编码问题
- 生产环境建议开启SSL验证
常见问题处理:
- 如果遇到编码问题,可以使用 mb_convert_encoding() 进行转换
- 如果需要处理JavaScript渲染的页面,可能需要使用无头浏览器(如Puppeteer)
- 对于大文件下载,建议使用流式处理而不是一次性加载到内存
The separation of the quickFetch static method from the full class shows good API design. Simple use cases don't need to learn the whole API, but power users have all the flexibility they need.
要是能加上一个简单的缓存机制就更好了。可以设置缓存时间,相同的url在规定时间内直接从缓存读,减少重复请求。对开发测试特别有用。
Great job on the documentation. The Chinese comments help local developers, the English function names make it accessible internationally, and the usage examples at the end tie everything together.
测试了几个奇葩网站,包括需要认证的、返回gzip的、有中文乱码的,都能正常处理。代码健壮性很好,基本上涵盖了90%的常见场景。
The fluent interface design pattern is perfect for configuration-heavy classes like this. Being able to chain methods makes the client code much cleaner than passing a huge array of options.