// File: miyabaobei.go

package spider_lib
// 基础包
import (
"github.com/PuerkitoBio/goquery" //DOM解析
"github.com/henrylee2cn/pholcus/app/downloader/context" //必需
// . "github.com/henrylee2cn/pholcus/reporter" //信息输出
. "github.com/henrylee2cn/pholcus/app/spider" //必需
// . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
// net包
// "net/http" //设置http.Header
// "net/url"
// 编码包
// "encoding/xml"
// "encoding/json"
// 字符串处理包
"regexp"
"strconv"
"strings"
// 其他包
// "fmt"
// "math"
// "time"
)
// init runs when the package is loaded and calls AddMenu on the Miyabaobei
// spider defined in this file.
// NOTE(review): AddMenu presumably registers the spider in the framework's
// list of selectable spiders — confirm against Spider.AddMenu.
func init() {
	Miyabaobei.AddMenu()
}
// miyabaobeiSpaceRe matches a single whitespace character. It is compiled once
// at package initialization instead of on every parsed response.
var miyabaobeiSpaceRe = regexp.MustCompile(`\s`)

// Miyabaobei is the spider definition for product data on www.miyabaobei.com.
//
// Its rule tree works in three steps:
//   - "获取版块URL" starts from the home page and follows every anchor found
//     under ".ccon";
//   - "生成请求" fetches the first page of such a section, reads the pager to
//     determine the page count and enqueues the remaining pages;
//   - "商品列表" emits one item (title, price, category) per ".bmfo" element.
var Miyabaobei = &Spider{
	Name:        "蜜芽宝贝",
	Description: "蜜芽宝贝商品数据 [Auto Page] [www.miyabaobei.com]",
	// Pausetime: [2]uint{uint(3000), uint(1000)},
	// Keyword:   USE,
	UseCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the queue with the site's home page.
		Root: func(self *Spider) {
			self.AddQueue(map[string]interface{}{"Url": "http://www.miyabaobei.com/", "Rule": "获取版块URL"})
		},

		Trunk: map[string]*Rule{

			"获取版块URL": {
				// ParseFunc collects the href of every anchor under ".ccon",
				// makes it absolute, and asks the "生成请求" helper to enqueue
				// the first page (loop index 0) of that section.
				ParseFunc: func(self *Spider, resp *context.Response) {
					query := resp.GetDom()
					query.Find(".ccon").Each(func(i int, section *goquery.Selection) {
						section.Find("a").Each(func(n int, anchor *goquery.Selection) {
							href, ok := anchor.Attr("href")
							if !ok {
								return
							}
							// Links that do not mention the site host are treated as relative.
							if !strings.Contains(href, "http://www.miyabaobei.com") {
								href = "http://www.miyabaobei.com" + href
							}
							self.Aid("生成请求", map[string]interface{}{
								"loop":    [2]int{0, 1},
								"urlBase": href,
								"req": map[string]interface{}{
									"Rule": "生成请求",
									// Remember the section URL so later pages can be derived from it.
									"Temp": map[string]interface{}{"baseUrl": href},
								},
							})
						})
					})
				},
			},

			"生成请求": {
				// AidFunc enqueues one request per index in [loop[0], loop[1]).
				//
				// Expected keys in aid:
				//   "loop"    [2]int                  half-open range of page indexes
				//   "urlBase" string                  URL the paging parameter is appended to
				//   "req"     map[string]interface{}  request template; its "Url" is overwritten
				//
				// The paging parameter is "&per_page=" followed by index*40
				// (presumably 40 products per page — confirm against the site).
				// It always returns nil.
				AidFunc: func(self *Spider, aid map[string]interface{}) interface{} {
					req := aid["req"].(map[string]interface{})
					for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
						req["Url"] = aid["urlBase"].(string) + "&per_page=" + strconv.Itoa(loop[0]*40)
						self.AddQueue(req)
					}
					return nil
				},
				// ParseFunc reads the total page count from the pager, enqueues
				// pages 1..total-1 under the "商品列表" rule, and then parses the
				// current response (page 0) with that same rule.
				ParseFunc: func(self *Spider, resp *context.Response) {
					query := resp.GetDom()
					totalPage := "1"

					pager := query.Find(".Lpage.page p a")
					if pager.Length() != 0 {
						// When the last pager link is the ">" arrow, the page
						// count is the link just before it.
						if pager.Last().Text() == ">" {
							totalPage = pager.Eq(pager.Length() - 2).Text()
						} else {
							totalPage = pager.Last().Text()
						}
					}

					total, err := strconv.Atoi(strings.TrimSpace(totalPage))
					if err != nil {
						// Unreadable pager: fall back to the current page only.
						total = 1
					}

					// Call the helper of the named rule. The keys must match the
					// ones AidFunc reads ("urlBase" and "req"); otherwise its type
					// assertions panic and no further pages are queued.
					self.Aid("生成请求", map[string]interface{}{
						"loop":    [2]int{1, total},
						"urlBase": resp.GetTemp("baseUrl").(string),
						"req": map[string]interface{}{
							"Rule": "商品列表",
						},
					})

					// Parse the current response with the named rule.
					self.Parse("商品列表", resp)
				},
			},

			"商品列表": {
				// Note: the declared output fields and the data actually
				// emitted must stay consistent with each other.
				OutFeild: []string{
					"标题",
					"价格",
					"类别",
				},
				// ParseFunc extracts the category from the ".crumbs" text and
				// adds one item per ".bmfo" element to the response.
				ParseFunc: func(self *Spider, resp *context.Response) {
					query := resp.GetDom()

					// Category: breadcrumb text with whitespace and the leading
					// site-name segment removed.
					goodsType := query.Find(".crumbs").Text()
					goodsType = miyabaobeiSpaceRe.ReplaceAllString(goodsType, "")
					goodsType = strings.Replace(goodsType, "蜜芽宝贝>", "", -1)

					query.Find(".bmfo").Each(func(i int, item *goquery.Selection) {
						// Title: taken from the first "p a" element's title
						// attribute; empty when the attribute is absent.
						title, _ := item.Find("p a").First().Attr("title")

						// Price
						price := item.Find(".f20").Text()

						// Hand the result over to the response.
						resp.AddItem(map[string]interface{}{
							self.OutFeild(resp, 0): title,
							self.OutFeild(resp, 1): price,
							self.OutFeild(resp, 2): goodsType,
						})
					})
				},
			},
		},
	},
}