1. 项目地址:

    # 爬虫框架
    https://github.com/gocolly/colly
    # 协程池框架
    https://github.com/panjf2000/ants/blob/master/README_ZH.md
  2. 用法:

     package main
    
     import (
         "fmt"
         "github.com/gocolly/colly/v2"
         "github.com/panjf2000/ants/v2"
         "sync"
     )
    
     // main crawls the Autohome Beijing home page, follows every link found in
     // the focus list, strips <img> tags from each article body, and submits the
     // remaining HTML to a fixed-size goroutine pool for processing. It returns
     // once every submitted task has finished.
     func main() {
         var wg sync.WaitGroup
         pool, err := ants.NewPoolWithFunc(10, func(param interface{}) {
             // Deferred so the WaitGroup counter is always released, even if
             // the handler returns early or panics; otherwise wg.Wait() below
             // could block forever.
             defer wg.Done()
             // TODO: business logic goes here.
             str, ok := param.(string)
             if !ok {
                 fmt.Printf("任务参数类型错误: %T\n", param)
                 return
             }
             fmt.Println(str)
         })
         if err != nil {
             // Without a pool there is nothing to run the tasks on.
             fmt.Println("创建协程池失败:" + err.Error())
             return
         }
         defer pool.Release()
         url := "https://www.autohome.com.cn/beijing/"
         c := colly.NewCollector() // outer collector: list page
         c.OnRequest(func(r *colly.Request) {
             // Pretend to be a desktop browser.
             r.Headers.Set("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36")
         })
         c.OnHTML("#homepage-focus-live .focus-middle .ui-list li a", func(e *colly.HTMLElement) {
             link := e.Attr("href")
             url1 := e.Request.AbsoluteURL(link) // resolve to an absolute URL
             if url1 == "" {
                 // Missing or unresolvable href: nothing to visit.
                 return
             }
             c1 := colly.NewCollector() // inner collector: article page
             c1.OnHTML(".introduce .introduce_content .article-content", func(e1 *colly.HTMLElement) {
                 e1.DOM.Find("img").Remove() // drop tags we do not need
                 str, err := e1.DOM.Html()
                 if err != nil {
                     fmt.Println("提取正文失败:" + err.Error())
                     return
                 }
                 wg.Add(1)
                 if err := pool.Invoke(str); err != nil {
                     // The task never reached a worker, so no worker will call
                     // Done for it; balance the Add here to avoid a deadlock.
                     wg.Done()
                     fmt.Println("放入协程池失败:" + err.Error())
                 }
             })
             if err := c1.Visit(url1); err != nil {
                 fmt.Println("访问页面失败:" + url1 + " " + err.Error())
             }
         })
         if err := c.Visit(url); err != nil {
             fmt.Println("访问页面失败:" + url + " " + err.Error())
         }
         wg.Wait()
     }
文档更新时间: 2024-04-18 16:35   作者:lee