def database_init(dbflag='local'):
if dbflag=='local':
conn = mysql.connector.connect(user='root', password='password', database='lianjiaSpider',host='localhost')
else:
conn = mysql.connector.connect(user='user', password='password', database='qdm1940_db',host='qdm1940.my3w.com')
dbc = conn.cursor()
dbc.execute('create table if not exists houseinfo (id int(10) NOT NULL AUTO_INCREMENT primary key,houseID varchar(50) , Title varchar(200), link varchar(200), cellname varchar(100),\
years varchar(200),housetype varchar(50),square varchar(50), direction varchar(50),floor varchar(50),taxtype varchar(200), \
totalPrice varchar(200), unitPrice varchar(200),followInfo varchar(200),validdate varchar(50),validflag varchar(20))')
dbc.execute('create table if not exists hisprice (id int(10) NOT NULL AUTO_INCREMENT primary key,houseID varchar(50) , date varchar(50), totalPrice varchar(200))')
conn.commit()
dbc.close()
return conn
def houseinfo_insert_mysql(conn,info_dict):
info_list = [u'houseID',u'Title',u'link',u'cellname',u'years',u'housetype',u'square',u'direction',u'floor',\
u'taxtype',u'totalPrice',u'unitPrice',u'followInfo',u'validdate',u'validflag']
t=[]
for il in info_list:
if il in info_dict:
t.append(info_dict[il])
else:
t.append('')
t=tuple(t)
today = get_today()
t2=(info_dict[u'houseID'],today,info_dict[u'totalPrice'])
cursor = conn.cursor()
cursor.execute('select * from houseinfo where houseID = (%s)',(info_dict[u'houseID'],))
values = cursor.fetchall()
if len(values)>0:
nvs = zip(info_list,list(values[0][1:]))
Qres = dict( (info_list,value) for info_list,value in nvs)
else:
pass
if len(values)==0:
cursor.execute('insert into houseinfo (houseID,Title,link,cellname,years,housetype,square,direction,floor,\
taxtype,totalPrice,unitPrice,followInfo,validdate,validflag) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', t)
cursor.execute('insert into hisprice (houseID,date,totalPrice) values (%s,%s,%s)', t2)
else:
if int(today)>int(Qres[u'validdate']):
cursor.execute('update houseinfo set validdate = %s,validflag= %s where houseid = %s',(today,'1',info_dict[u'houseID']))
cursor.execute('insert into hisprice (houseID,date,totalPrice) values (%s,%s,%s)', t2)
else:
cursor.execute('update houseinfo set validflag= %s where houseid = %s',('1',info_dict[u'houseID']))
cursor.execute('update hisprice set totalPrice= %s where houseid = %s',(info_dict[u'totalPrice'],info_dict[u'houseID']))
if int(Qres[u'totalPrice']) != int(info_dict[u'totalPrice']):
info_dict[u'oldprice'] = Qres[u'totalPrice']
conn.commit()
cursor.close()
def trigger_notify_email(info_dict,reason='newhouse'):
if reason == 'newhouse':
title = u'新上房源' + ' ' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + ' ' + info_dict[u'totalPrice']+'万'
cotent_txt = u'新上房源' + ' ' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + ' 朝向' + info_dict[u'direction'] + \
' 面积' + info_dict[u'square'] + ' 总价' + info_dict[u'totalPrice']+'万 单价' + info_dict[u'unitPrice']+'万每平米' \
+ u' 房源编号' + info_dict[u'houseID'] + ' ' + u'查看链接 ' + info_dict[u'link']
content_html = '<html><body><h1>新上房源</h1>' + '<p>' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + \
' 朝向' + info_dict[u'direction'] + ' 面积' + info_dict[u'square'] + ' 总价' + info_dict[u'totalPrice'] + \
'万 单价' + info_dict[u'unitPrice']+'万每平米' + u' 房源编号' + info_dict[u'houseID'] + ' ' + u'点击查看链接 ' + \
'<a href=\"'+ info_dict[u'link'] + '\">' + info_dict[u'link'] + '</a>...</p>' + '</body></html>'
else:
title = u'房屋价格变动' + ' ' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + info_dict[u'oldprice'] + '----> ' + info_dict[u'totalPrice']+'万'
cotent_txt = u'房屋价格变动' + ' ' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + ' 朝向' + info_dict[u'direction'] + \
' 面积' + info_dict[u'square'] + ' 总价 ' + info_dict[u'oldprice']+'万 ' + '----> ' + info_dict[u'totalPrice']+'万 单价' + \
info_dict[u'unitPrice']+'万每平米' + u' 房源编号' + info_dict[u'houseID'] + ' ' + u'查看链接 ' + info_dict[u'link']
content_html = '<html><body><h1>房屋价格变动</h1>' + '<p>' + info_dict[u'cellname'] + ' ' + info_dict[u'housetype'] + \
' 朝向' + info_dict[u'direction'] + ' 面积' + info_dict[u'square'] + ' 总价 ' + info_dict[u'oldprice']+'万 ' \
+ '----> '+ info_dict[u'totalPrice'] + '万 单价' + info_dict[u'unitPrice']+'万每平米' + u' 房源编号' + \
info_dict[u'houseID'] + ' ' + u'点击查看链接 ' + '<a href=\"'+ info_dict[u'link'] + '\">' + info_dict[u'link'] + \
'</a>...</p>' + '</body></html>'
from_addr = 'ziyubiti@qq.com'
password = 'password'
to_addr = 'ziyubiti@139.com'
smtp_server = 'smtp.qq.com'
msg = MIMEMultipart('alternative')
msg.attach(MIMEText(cotent_txt, 'plain', 'utf-8'))
msg.attach(MIMEText(content_html, 'html', 'utf-8'))
msg['From'] = _format_addr(u'紫雨 <%s>' % from_addr)
msg['To'] = _format_addr(u'紫雨 <%s>' % to_addr)
msg['Subject'] = Header(title, 'utf-8').encode()
smtp_port = 465
server = smtplib.SMTP(smtp_server, smtp_port)
server.starttls()
server.set_debuglevel(1)
server.login(from_addr, password)
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()
def _format_addr(s):
name, addr = parseaddr(s)
return formataddr((Header(name, 'utf-8').encode(), addr))
def all_set_unvalid(conn):
cursor = conn.cursor()
cursor.execute('update houseinfo set validflag= %s',('0',))
conn.commit()
cursor.close()
def house_percell_spider(conn,cellname = u'荣丰2008'):
url=u"http://bj.lianjia.com/ershoufang/rs" + quote(cellname) +"/"
req = Request(url,headers=hds[random.randint(0,len(hds)-1)])
source_code = urlopen(req,timeout=5).read()
soup = BeautifulSoup(source_code,'lxml')
total_pages = 0
page_info= soup.find('div',{'class':'page-box house-lst-page-box'}).get('page-data').split(',')[0]
total_pages= int(page_info[-1])
info_dict_all = {}
for page in range(total_pages):
if page>0:
url_page=u"http://bj.lianjia.com/ershoufang/pg%drs%s/" % (page+1,quote(cellname))
req = Request(url_page,headers=hds[random.randint(0,len(hds)-1)])
source_code = urlopen(req,timeout=50).read()
soup = BeautifulSoup(source_code,'lxml')
nameList = soup.findAll("li", {"class":"clear"})
i = 0
for name in nameList:
i = i + 1
info_dict = {}
info_dict_all.setdefault(i+page*30,{})
housetitle = name.find("div",{"class":"title"})
info_dict.update({u'Title':housetitle.get_text()})
info_dict.update({u'link':housetitle.a.get('href')})
houseaddr = name.find("div",{"class":"address"})
info = houseaddr.div.get_text().split('|')
info_dict.update({u'cellname':info[0]})
info_dict.update({u'housetype':info[1]})
info_dict.update({u'square':info[2]})
info_dict.update({u'direction':info[3]})
housefloor = name.find("div",{"class":"flood"})
floor_all = housefloor.div.get_text().split('-')[0].strip().split(' ')
info_dict.update({u'floor':floor_all[0]})
info_dict.update({u'years':floor_all[-1]})
followInfo = name.find("div",{"class":"followInfo"})
info_dict.update({u'followInfo':followInfo.get_text()})
tax = name.find("div",{"class":"tag"})
info_dict.update({u'taxtype':tax.get_text()})
totalPrice = name.find("div",{"class":"totalPrice"})
info_dict.update({u'totalPrice':totalPrice.span.get_text()})
unitPrice = name.find("div",{"class":"unitPrice"})
info_dict.update({u'unitPrice':unitPrice.get('data-price')})
info_dict.update({u'houseID':unitPrice.get('data-hid')})
today = get_today()
info_dict.update({u'validdate':today})
info_dict.update({u'validflag':str('1')})
houseinfo_insert_mysql(conn,info_dict)
info_dict_all[i+page*30] = info_dict
return info_dict_all
def house_celllist_spider(conn,celllist = [u'荣丰2008',u'保利茉莉公馆']):
all_set_unvalid(conn)
for cellname in celllist:
house = house_percell_spider(conn,cellname)
return house