项目地址:
# 爬虫框架 https://github.com/gocolly/colly
# 协程池框架 https://github.com/panjf2000/ants/blob/master/README_ZH.md
用法:
// Command main crawls the Autohome Beijing home page with colly, follows every
// link found in the focus list, and hands the HTML of each article body to an
// ants goroutine pool for processing.
package main

import (
	"fmt"
	"sync"

	"github.com/gocolly/colly/v2"
	"github.com/panjf2000/ants/v2"
)

// main wires the outer (list page) collector, a per-link inner (article page)
// collector and the worker pool together, then waits for all submitted tasks.
func main() {
	// wg counts tasks that were handed to the pool and have not finished yet.
	var wg sync.WaitGroup

	pool, err := ants.NewPoolWithFunc(10, func(param interface{}) {
		// Deferred so the counter is released even if the processing below
		// panics or bails out early; otherwise wg.Wait() would never return.
		defer wg.Done()

		str, ok := param.(string)
		if !ok {
			return
		}
		// TODO: business logic goes here.
		fmt.Println(str)
	})
	if err != nil {
		fmt.Println("创建协程池失败:" + err.Error())
		return
	}
	defer pool.Release()

	startURL := "https://www.autohome.com.cn/beijing/"

	// Outer collector: fetches the list page.
	c := colly.NewCollector()
	c.OnRequest(func(r *colly.Request) {
		// Send a browser-like User-Agent with each request.
		r.Headers.Set("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36")
	})

	c.OnHTML("#homepage-focus-live .focus-middle .ui-list li a", func(e *colly.HTMLElement) {
		// Resolve the (possibly relative) href against the page URL.
		articleURL := e.Request.AbsoluteURL(e.Attr("href"))

		// Inner collector: fetches a single article page.
		c1 := colly.NewCollector()
		c1.OnHTML(".introduce .introduce_content .article-content", func(e1 *colly.HTMLElement) {
			// Strip tags we do not want in the extracted content.
			e1.DOM.Find("img").Remove()

			html, err := e1.DOM.Html()
			if err != nil {
				fmt.Println("提取文章内容失败:" + err.Error())
				return
			}

			wg.Add(1)
			if err := pool.Invoke(html); err != nil {
				// The task was never accepted, so the worker will not call
				// Done for it; undo the Add here to keep wg.Wait() from hanging.
				wg.Done()
				fmt.Println("放入协程池失败:" + err.Error())
			}
		})

		if err := c1.Visit(articleURL); err != nil {
			fmt.Println("访问文章页面失败:" + err.Error())
		}
	})

	if err := c.Visit(startURL); err != nil {
		fmt.Println("访问列表页面失败:" + err.Error())
	}

	// Block until every task accepted by the pool has finished.
	wg.Wait()
}
文档更新时间: 2024-04-18 16:35 作者:lee