Skip to content

第一步:创建项目

bash
cargo new web-crawler
cd web-crawler

Cargo.toml

toml
[package]
name = "web-crawler"
version = "0.1.0"
edition = "2021"

[dependencies]
tokio = { version = "1", features = ["full"] }
reqwest = { version = "0.11", features = ["json"] }
scraper = "0.18"  # HTML 解析
url = "2"
select = "0.6"    # 另一个 HTML 提取库(本文示例中的 CSS 选择器由 scraper 提供)
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
anyhow = "1"
tracing = "0.1"
tracing-subscriber = "0.3"
chrono = { version = "0.4", features = ["serde"] }
futures = "0.3"
async-channel = "2"

第二步:配置模块

config.toml

toml
[crawler]
max_depth = 3
max_concurrent = 10
timeout_seconds = 30
user_agent = "RustCrawler/0.1"

[storage]
output_dir = "./output"
save_html = false
save_text = true

src/config.rs

rust
use serde::Deserialize;

/// Top-level application configuration, deserialized with serde.
///
/// Each field corresponds to a table of the same name in the configuration
/// document (`[crawler]` and `[storage]`).
#[derive(Debug, Deserialize)]
pub struct Config {
    /// Crawler behaviour settings (the `crawler` table).
    pub crawler: CrawlerConfig,
    /// Output settings (the `storage` table).
    pub storage: StorageConfig,
}

/// Settings that control how pages are crawled and requested.
#[derive(Debug, Deserialize)]
pub struct CrawlerConfig {
    /// Maximum link depth to follow.
    /// NOTE(review): presumably counted from the seed URL; the consumer is not
    /// visible in this file, so confirm against the crawl loop.
    pub max_depth: usize,
    /// Upper bound on concurrent work.
    /// NOTE(review): presumably the number of in-flight requests; confirm
    /// against the crawl loop.
    pub max_concurrent: usize,
    /// Request timeout, in whole seconds.
    pub timeout_seconds: u64,
    /// Value for the `User-Agent` request header.
    pub user_agent: String,
}

/// Settings that control what the crawler writes to disk.
///
/// NOTE(review): no consumer of these fields is visible in this file; the
/// per-field descriptions below are inferred from the names and should be
/// confirmed against the storage code.
#[derive(Debug, Deserialize)]
pub struct StorageConfig {
    /// Directory that receives the output (presumably created on demand).
    pub output_dir: String,
    /// Whether to keep the raw HTML of each page (presumed).
    pub save_html: bool,
    /// Whether to keep the extracted text of each page (presumed).
    pub save_text: bool,
}

impl Config {
    pub fn load() -> anyhow::Result<Self> {
        let content = std::fs::read_to_string("config.toml")?;
        let config: Config = toml::from_str(&content)?;
        Ok(config)
    }
}
▶ Run

第三步:网页获取

src/crawler/fetcher.rs

rust
use reqwest::Client;
use std::time::Duration;
use crate::config::CrawlerConfig;

/// Web page fetcher: a thin wrapper around a configured `reqwest::Client`.
pub struct Fetcher {
    /// HTTP client shared by every request this fetcher makes.
    client: Client,
}

impl Fetcher {
    /// Creates a fetcher whose client uses the timeout and `User-Agent` from
    /// `config`.
    ///
    /// # Panics
    ///
    /// Panics if the HTTP client cannot be built (for example when
    /// `config.user_agent` is not a valid header value). Use
    /// [`Fetcher::try_new`] to handle that case without panicking.
    pub fn new(config: &CrawlerConfig) -> Self {
        Self::try_new(config).expect("failed to build HTTP client from crawler config")
    }

    /// Fallible counterpart of [`Fetcher::new`].
    ///
    /// # Errors
    ///
    /// Returns the `reqwest` builder error if the client cannot be
    /// constructed from `config`.
    pub fn try_new(config: &CrawlerConfig) -> anyhow::Result<Self> {
        let client = Client::builder()
            .timeout(Duration::from_secs(config.timeout_seconds))
            .user_agent(&config.user_agent)
            .build()?;

        Ok(Self { client })
    }

    /// Fetches `url` and returns the response body as text.
    ///
    /// # Errors
    ///
    /// Returns an error if the request fails, if the server answers with a
    /// 4xx/5xx status, or if the body cannot be read as text.
    pub async fn fetch(&self, url: &str) -> anyhow::Result<String> {
        let response = self
            .client
            .get(url)
            .send()
            .await?
            // This method returns only the body, so callers cannot inspect the
            // status themselves; without this check an error page would be
            // handed back as if it were the requested content.
            .error_for_status()?;

        Ok(response.text().await?)
    }

    /// Fetches `url` and returns the body together with the response status
    /// and headers.
    ///
    /// Unlike [`Fetcher::fetch`], a 4xx/5xx response is *not* treated as an
    /// error: the status is reported in [`FetchResult::status`] and the caller
    /// decides what to do with it.
    ///
    /// # Errors
    ///
    /// Returns an error if the request fails or the body cannot be read as
    /// text.
    pub async fn fetch_with_info(&self, url: &str) -> anyhow::Result<FetchResult> {
        let response = self.client.get(url).send().await?;

        // Status and headers must be captured first: `text()` consumes the
        // response.
        let status = response.status().as_u16();
        let headers = response.headers().clone();
        let html = response.text().await?;

        Ok(FetchResult {
            url: url.to_string(),
            status,
            headers,
            html,
        })
    }
}

/// Outcome of a single fetch: the body plus response metadata.
pub struct FetchResult {
    /// The URL that was requested, exactly as passed to the fetcher.
    pub url: String,
    /// Numeric HTTP status code of the response.
    pub status: u16,
    /// Copy of the response headers.
    pub headers: reqwest::header::HeaderMap,
    /// Response body decoded as text.
    pub html: String,
}
▶ Run

第四步:HTML 解析

src/crawler/parser.rs

rust
use scraper::{Html, Selector};

/// HTML parser: holds one parsed document and answers queries against it.
pub struct HtmlParser {
    /// The parsed document tree that every query method selects from.
    document: Html,
}

impl HtmlParser {
    /// 从 HTML 字符串创建
    pub fn from_html(html: &str) -> Self {
        let document = Html::parse_document(html);
        Self { document }
    }
    
    /// 提取标题
    pub fn title(&self) -> Option<String> {
        let selector = Selector::parse("title").unwrap();
        self.document
            .select(&selector)
            .next()
            .map(|el| el.text().collect::<String>())
    }
    
    /// 提取所有链接
    pub fn links(&self, base_url: &str) -> Vec<String> {
        let selector = Selector::parse("a[href]").unwrap();
        
        self.document
            .select(&selector)
            .filter_map(|el| {
                let href = el.value().attr("href")?;
                
                // 解析相对 URL
                let url = url::Url::parse(base_url)
                    .ok()?
                    .join(href)
                    .ok()?;
                
                // 只保留 HTTP/HTTPS 链接
                if url.scheme() == "http" || url.scheme() == "https" {
                    Some(url.to_string())
                } else {
                    None
                }
            })
            .collect()
    }
    
    /// 提取文本内容
    pub fn text(&self) -> String {
        let selector = Selector::parse("body").unwrap();
        
        self.document
            .select(&selector)
            .next()
            .map(|el| {
                el.text()
                    .collect::<Vec<_>>()
                    .join(" ")
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ")
            })
            .unwrap_or_default()
    }
    
    /// 提取所有图片
    pub fn images(&self, base_url: &str) -> Vec<String> {
        let selector = Selector::parse("img[src]").unwrap();
        
        self.document
            .select(&selector)
            .filter_map(|el| {
                let src = el.value().attr("src")?;
                
                url::Url::parse(base_url)
                    .ok()?
                    .join(src)
                    .ok()
                    .map(|u| u.to_string())
            })
            .collect()
    }
    
    /// 使用 CSS 选择器提取元素
    pub fn select(&self, selector: &str) -> Vec<String> {
        let sel = Selector::parse(selector).unwrap();
        
        self.document
            .select(&sel)
            .map(|el| el.text().collect::<String>())
            .collect()
    }
    
    /// 提取 meta 信息
    pub fn meta(&self) -> std::collections::HashMap<String, String> {
        let selector = Selector::parse("meta[name][content]").unwrap();
        
        self.document
            .select(&selector)
            .filter_map(|el| {
                let name = el.value().attr("name")?;
                let content = el.value().attr("content")?;
                Some((name.to_string(), content.to_string()))
            })
            .collect()
    }
}
▶ Run

第五步:URL 队列

src/crawler/queue.rs

rust
use std::collections::{HashSet, VecDeque};
use std::sync::Arc;
use tokio::sync::Mutex;

/// Mutable queue state. Only ever accessed through the single lock owned by
/// [`UrlQueue`], so the two collections are always observed consistently.
#[derive(Debug, Default)]
struct QueueState {
    /// URLs waiting to be handed out, in FIFO order. Never holds duplicates.
    pending: VecDeque<String>,
    /// Every URL ever accepted: those still in `pending` plus those already
    /// handed out by `pop`.
    seen: HashSet<String>,
}

impl QueueState {
    /// Queues `url` unless it is already pending or was handed out before.
    fn enqueue(&mut self, url: String) {
        if !self.seen.contains(&url) {
            self.seen.insert(url.clone());
            self.pending.push_back(url);
        }
    }
}

/// De-duplicating FIFO queue of URLs to crawl.
///
/// A URL is accepted at most once over the lifetime of the queue: pushing a
/// URL that is already waiting, or that has already been popped, is a no-op.
///
/// All state sits behind one lock. Guarding the pending list and the visited
/// set with separate locks invites lock-order inversions (and thus deadlocks)
/// between `push` and `pop`.
///
/// Cloning is cheap and yields another handle to the *same* queue.
#[derive(Debug, Clone)]
pub struct UrlQueue {
    state: Arc<Mutex<QueueState>>,
}

impl UrlQueue {
    /// Creates an empty queue.
    pub fn new() -> Self {
        Self {
            state: Arc::new(Mutex::new(QueueState::default())),
        }
    }

    /// Adds `url` unless it is already queued or has already been popped.
    pub async fn push(&self, url: String) {
        self.state.lock().await.enqueue(url);
    }

    /// Adds every URL in `urls`, skipping duplicates, under a single lock
    /// acquisition.
    pub async fn push_batch(&self, urls: Vec<String>) {
        let mut state = self.state.lock().await;
        for url in urls {
            state.enqueue(url);
        }
    }

    /// Removes and returns the oldest pending URL, or `None` if nothing is
    /// waiting. The returned URL counts as visited from then on.
    pub async fn pop(&self) -> Option<String> {
        // The URL stays in `seen`, which is what marks it as visited.
        self.state.lock().await.pending.pop_front()
    }

    /// Returns `true` if no URL is waiting to be popped.
    pub async fn is_empty(&self) -> bool {
        self.state.lock().await.pending.is_empty()
    }

    /// Returns the number of URLs waiting to be popped.
    pub async fn size(&self) -> usize {
        self.state.lock().await.pending.len()
    }

    /// Returns the number of distinct URLs handed out by `pop` so far.
    pub async fn visited_count(&self) -> usize {
        let state = self.state.lock().await;
        // `pending` is a duplicate-free subset of `seen`, so the difference is
        // exactly the set of URLs that have been popped.
        state.seen.len() - state.pending.len()
    }
}

impl Default for UrlQueue {
    fn default() -> Self {
        Self::new()
    }
}
▶ Run