京东bra评论数据合集

京东bra评论数据合集

都市丽人

累计爬取 bra 900 件

累计数据 130688 条

欧迪芬

累计爬取 bra 300 件

累计数据 44577 条

代码

数据库交互

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#dbUtil.py
import pymysql

class db:
    """Small helper around one pymysql connection and its cursor.

    Connection settings are hard-coded in ``__init__``. Every statement goes
    through :meth:`execute`, which commits after each statement and terminates
    the process with exit status 1 on any database error.
    """

    def __init__(self):
        """Connect to MySQL and open a cursor; exit the process on failure."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration or the environment instead.
        self.host = "192.168.2.216"
        self.user = "root"
        self.password = '12134'
        self.dbName = 'test'
        self.port = 3306
        try:
            conn = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                   db=self.dbName, port=self.port)
        except pymysql.err.OperationalError:
            print("数据库连接错误")
            raise SystemExit(1)
        except Exception:
            print("数据库连接其他错误 退出")
            raise SystemExit(1)
        self.cur = conn.cursor()
        self.db = conn

    def close(self):
        """Close the cursor first, then the connection it belongs to."""
        try:
            self.cur.close()
        finally:
            self.db.close()

    def _rollbackQuietly(self):
        """Roll back the open transaction; a failing rollback is ignored."""
        try:
            self.db.rollback()
        except Exception:
            # Best effort only: the process is about to exit anyway.
            pass

    def execute(self, sql, params=None):
        """Run one statement, commit, and return the fetched rows.

        Args:
            sql: the statement text. When ``params`` is given it must use
                ``%s`` placeholders instead of inlined values.
            params: optional sequence of values bound by the driver, so that
                quotes inside scraped text cannot break or alter the statement.

        Returns:
            Whatever ``cursor.fetchall()`` returns, or ``None`` when the
            statement succeeded but fetching the result failed.

        Raises:
            SystemExit: with status 1 on any execution error, after the
                transaction has been rolled back.
        """
        print(sql)
        if params is not None:
            print(params)
        try:
            self.cur.execute(sql, params)
            self.db.commit()
        except pymysql.err.ProgrammingError:
            print("语法错误")
            # Roll back *before* exiting; the original placed the rollback
            # after exit(1), where it could never run.
            self._rollbackQuietly()
            raise SystemExit(1)
        except Exception:
            print("执行语句其他错误,退出")
            self._rollbackQuietly()
            raise SystemExit(1)
        try:
            return self.cur.fetchall()
        except Exception:
            print("执行成功 但返回失败")
            return None

    def insertBraUrl(self, b):
        """Insert one product link; ``b`` is ``(url, name, goods_id)``.

        The values are written positionally as (NAME, URL, GOODSID) after the
        auto-increment ID.
        """
        sql = "INSERT INTO BRAURL VALUES(null,%s,%s,%s);"
        self.execute(sql, (b[1], b[0], b[2]))

    def getBraUrl(self):
        """Return every row of BRAURL."""
        sql = "SELECT * FROM BRAURL;"
        return self.execute(sql)

    def getNewBraUrl(self, minId=3387):
        """Return the BRAURL rows whose ID is greater than ``minId``.

        The default of 3387 is the threshold that was previously hard-coded
        in the query.
        """
        sql = "SELECT * FROM BRAURL WHERE ID > %s;"
        return self.execute(sql, (minId,))

    def _insertComment(self, table, bra):
        """Insert one comment record into ``table`` (a fixed, trusted name)."""
        sql = "INSERT INTO " + table + " VALUES(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
        self.execute(sql, (bra.braId, bra.braName, bra.braUrl, bra.comId, bra.comment,
                           bra.size, bra.color, bra.date, bra.client))

    def inserDSLR(self, bra):
        """Insert one comment record into the DSLR table."""
        self._insertComment("DSLR", bra)

    def inserODF(self, bra):
        """Insert one comment record into the ODF table."""
        self._insertComment("ODF", bra)

获取bra链接并存入数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import requests
from bs4 import BeautifulSoup
import re
from dbUtil import db

# HTTP request headers sent with every listing-page request made by request().
# NOTE(review): the Cookie value is a captured browser session embedded in
# source; it presumably expires and may need refreshing — confirm before reuse.
headers={
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,zh-TW;q=0.6',
'Connection': 'keep-alive',
'Cookie': 'ipLoc-djd=1-72-2799-0; shshshfpa=d6cf12e1-dcd2-7633-be04-9f3803b514f2-1531739718; shshshfpb=0435fca42478742d4f326897399c5e65c81027296a0ccf3055b4c7e47e; ipLocation=%u5317%u4EAC; __jdu=936337386; areaId=1; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1533172847247; PCSYCityID=1898; mt_xid=V2_52007VwsQVV9aVFgWTilVUm9UEwdYXU5bGR1KQABnVxdOVFhSWANOTlpWblEVUFgLAA0vShhcDHsCG05cW0NbF0IcWA5jBCJQbVhiWRxJEFsDZwoTYl1dVF0%3D; user-key=f58d99da-6bbe-4210-9a85-e4f58e3efd33; cn=0; 3AB9D23F7A4B3C9B=35JEID4ZBRCPPFQSHVZ3KH3SHSAHGMUSESFMACOI4GG2OB6LXLVMJQ4E6EVQS2324VQ6SJMXRLPXVNTREKH5MQUGVU; __jda=122270672.936337386.1531739721.1533180210.1533186701.4; __jdb=122270672.1.936337386|4.1533186701; shshshfp=bcf9f24b4397c578b4f0556b6e5db401; shshshsID=66cc5eea217193cc71fe768bca87165c_1_1533186701212',
'Referer': 'https://mall.jd.com/view_search-670138-6304094-99-1-24-1.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}


def getUrl(page, brand='90320'):
    """Build the JD listing URL for one result page of a brand.

    Args:
        page: result page number; converted with ``str``.
        brand: value placed after ``exbrand_`` in the ``ev`` filter. The
            default ``'90320'`` is the id the URL was originally hard-coded
            with (annotated as 都市丽人 in the source).

    Returns:
        The complete listing URL as a string.
    """
    return ('https://list.jd.com/list.html?cat=1315,1345,1364&ev=exbrand_' + str(brand)
            + '&page=' + str(page) + '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main')

def request(pages=range(1, 16)):
    """Yield ``(url, name, goods_id)`` for each product on the listing pages.

    Args:
        pages: iterable of listing page numbers to fetch; defaults to pages
            1 through 15, the range that was previously hard-coded.

    Yields:
        Tuples of the absolute product URL, the product name with spaces and
        newlines removed, and the first run of digits found in the link.
    """
    # Per the original note, each listing page carries 60 products.
    for page in pages:
        pageUrl = getUrl(page)
        print(pageUrl)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url=pageUrl, headers=headers, timeout=10)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')
        for p_name in soup.select('.p-name'):
            link = p_name.find('a')
            bra_name = p_name.find('em')
            # Skip entries missing the link or the name instead of crashing.
            if link is None or bra_name is None:
                continue
            href = link.get('href')
            if not href:
                continue
            idMatch = re.search(r"\d+", href)
            if idMatch is None:
                continue
            # The hrefs are protocol-relative, hence the 'https:' prefix.
            yield ('https:' + href, bra_name.text.replace(' ', '').replace('\n', ''), idMatch.group())

# Crawl the listing pages and store every product link in BRAURL.
# A separate name is used for the instance so the imported ``db`` class is not
# shadowed, and the connection is closed even if the crawl fails part-way.
store = db()
try:
    for record in request():
        store.insertBraUrl(record)
finally:
    store.close()

获取评论

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
from bs4 import BeautifulSoup
import json
import re
import time

class getComment:
    """Fetch product comments from JD's JSONP comment endpoint."""

    def __init__(self):
        """Store the HTTP headers that are sent with every comment request."""
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,zh-TW;q=0.6',
            'Connection': 'keep-alive',
            'Cookie': 'ipLoc-djd=1-72-2799-0; shshshfpa=d6cf12e1-dcd2-7633-be04-9f3803b514f2-1531739718; shshshfpb=0435fca42478742d4f326897399c5e65c81027296a0ccf3055b4c7e47e; ipLocation=%u5317%u4EAC; __jdu=936337386; areaId=1; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1533172847247; PCSYCityID=1898; mt_xid=V2_52007VwsQVV9aVFgWTilVUm9UEwdYXU5bGR1KQABnVxdOVFhSWANOTlpWblEVUFgLAA0vShhcDHsCG05cW0NbF0IcWA5jBCJQbVhiWRxJEFsDZwoTYl1dVF0%3D; user-key=f58d99da-6bbe-4210-9a85-e4f58e3efd33; cn=0; 3AB9D23F7A4B3C9B=35JEID4ZBRCPPFQSHVZ3KH3SHSAHGMUSESFMACOI4GG2OB6LXLVMJQ4E6EVQS2324VQ6SJMXRLPXVNTREKH5MQUGVU; __jda=122270672.936337386.1531739721.1533180210.1533186701.4; __jdb=122270672.1.936337386|4.1533186701; shshshfp=bcf9f24b4397c578b4f0556b6e5db401; shshshsID=66cc5eea217193cc71fe768bca87165c_1_1533186701212',
            'Referer': 'https://mall.jd.com/view_search-670138-6304094-99-1-24-1.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
        }

    def getUrl(self, id, i):
        """Return the comment-API URL for product ``id`` and page index ``i``.

        The URL requests 10 comments per page (``pageSize=10``). Both
        arguments are converted with ``str``.
        """
        url = ('https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv7244&productId=' + str(id)
               + '&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&rid=0&fold=1')
        return url

    @staticmethod
    def _parseComments(text):
        """Return the ``comments`` list from a JSONP (or plain JSON) response.

        The callback wrapper is located by its parentheses rather than by a
        fixed character offset, so the result does not depend on the length
        of the callback name or on trailing whitespace.

        Raises:
            ValueError: the text has no wrapper or is not valid JSON.
            KeyError: the payload has no ``comments`` entry.
        """
        body = text.strip()
        if not body.startswith(('{', '[')):
            body = body[body.index('(') + 1:body.rindex(')')]
        return json.loads(body)['comments']

    @staticmethod
    def _toItem(comment):
        """Map one raw API comment onto the dict shape yielded by ``requ``."""
        return {
            'color': comment['productColor'],
            # Kept from the original: single quotes are removed, presumably so
            # the text can be embedded in a quoted SQL string by the caller.
            'content': comment['content'].replace("'", ''),
            'guid': comment['guid'],
            'time': comment['referenceTime'],
            'size': comment['productSize'],
            'client': comment['userClientShow'],
        }

    def requ(self, id, maxPages=1000):
        """Yield one dict per comment of product ``id``.

        Pages are requested in order, starting at page 0, until a page comes
        back with no comments or ``maxPages`` pages have been requested. The
        method sleeps 4 seconds after every request. A page that cannot be
        parsed is reported and skipped.

        Yields:
            Dicts with the keys ``color``, ``content``, ``guid``, ``time``,
            ``size`` and ``client``.
        """
        for page in range(maxPages):  # each page carries up to 10 comments
            url = self.getUrl(id, page)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=url, headers=self.headers, timeout=10)
            time.sleep(4)
            response.encoding = 'GBK'
            try:
                comments = self._parseComments(response.text)
            except (ValueError, KeyError, TypeError):
                print('获取评论出错了')
                continue
            if not comments:
                break
            for comment in comments:
                try:
                    item = self._toItem(comment)
                except (KeyError, TypeError, AttributeError):
                    print('获取评论出错了')
                    continue
                # The yield is deliberately outside any try block so that
                # closing the generator is never swallowed by a handler.
                yield item

从数据库取出 bra 获取评论 再存入数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from dbUtil import db
from Utils import bra
from getComment import getComment
import time
from dbUtil import db


# Fetch the comments of every product selected from BRAURL and store them.
# Instances get their own names so the imported classes are not shadowed, and
# the connection is closed even if the crawl fails part-way.
commentFetcher = getComment()
store = db()
try:
    # execute() can return None when fetching the result fails; treat that
    # as "no rows" rather than crashing on iteration.
    BraInfo = store.getNewBraUrl() or ()
    for a in BraInfo:
        # Row layout follows the BRAURL table: (ID, NAME, URL, GOODSID).
        print(a[3])
        for item in commentFetcher.requ(a[3]):
            # A fresh record per comment, so no field leaks between rows.
            br = bra(braId=a[3], braName=a[1], braUrl=a[2], comId=item['guid'],
                     size=item['size'], date=item['time'], client=item['client'],
                     color=item['color'], comment=item['content'])
            # store.inserDSLR(br)  # use this instead for the 都市丽人 (DSLR) table
            store.inserODF(br)  # 欧迪芬 (ODF) table
        # NOTE(review): the source lost its indentation; this pause is assumed
        # to run once per product rather than once per comment — confirm.
        time.sleep(4)
finally:
    store.close()

其它工具

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
class bra:
    """One scraped comment together with the product it belongs to."""

    def __init__(self, braId, braName, braUrl, comId, size, date, client, color, comment):
        """Store the product fields and the comment fields as attributes."""
        self.braId = braId
        self.braName = braName
        self.braUrl = braUrl
        self.comId = comId
        self.size = size
        self.date = date
        self.client = client
        self.color = color
        self.comment = comment

    def __str__(self):
        """Return id, name, URL, size, date and client, one per line.

        The original printed these values and returned ``None``, which makes
        ``str(obj)`` and ``print(obj)`` raise ``TypeError``; the same fields
        are now returned as a single string.
        """
        fields = (self.braId, self.braName, self.braUrl, self.size, self.date, self.client)
        return '\n'.join(str(value) for value in fields)

MySQL 建表语句

1
2
3
4
5
6
/* BRAURL: one row per product link collected from the listing pages.
   insertBraUrl() inserts positionally (no column list), so the column order
   ID, NAME, URL, GOODSID must stay as declared here. */
create table BRAURL(ID int unsigned not null auto_increment primary key,NAME char(50) not null,URL char(50) not null,GOODSID char(20) not null);
/* DSLR: comment rows written by inserDSLR() (marked 都市丽人 in the loader script).
   Inserted positionally, so the column order must stay as declared here.
   NOTE(review): COMMENT is char(100) and DATE is char(10); longer scraped
   values would not fit — confirm against the data and the server SQL mode. */
create table DSLR(ID int unsigned not null auto_increment primary key,BRAID char(20) not null,BRANAME char(50) not null,BRAURL char(50) not null,COMID char(20) not null,COMMENT char(100) not null,SIZE char(10) not null,COLOR char(10) not null,DATE char(10) not null,CLIENT char(20) not null);
/* ODF: comment rows written by inserODF() (marked 欧迪芬 in the loader script).
   Same column layout as DSLR; also inserted positionally. */
create table ODF(ID int unsigned not null auto_increment primary key,BRAID char(20) not null,BRANAME char(50) not null,BRAURL char(50) not null,COMID char(20) not null,COMMENT char(100) not null,SIZE char(10) not null,COLOR char(10) not null,DATE char(10) not null,CLIENT char(20) not null);

部分数据分析如下

都市丽人客户端

都市丽人 SIZE

欧迪芬 客户端

欧迪芬 SIZE

汇总

size

Client

------ end ------