# -*- coding: utf-8 -*-
from __future__ import division

import json
import re
import sqlite3
import sys

import requests
from prettytable import PrettyTable as pt
def init_db(db_path='housing.db'):
    """Open the SQLite database and make sure the ``housing`` table exists.

    Args:
        db_path: Database file to open. Defaults to ``'housing.db'``
            (relative to the current working directory). ``':memory:'``
            gives a throwaway in-memory database.

    Returns:
        A ``(connection, cursor)`` tuple. The caller owns the connection
        and is responsible for closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table
            cannot be created.
    """
    conn = sqlite3.connect(db_path)
    try:
        db_cursor = conn.cursor()
        db_cursor.execute(
            '''CREATE TABLE IF NOT EXISTS housing '''
            '''(id INT PRIMARY KEY NOT NULL, name TEXT NOT NULL, '''
            '''area REAL , ve REAL , price REAL);''')
        conn.commit()
    except Exception:
        # Do not leak the open connection when schema creation fails.
        conn.close()
        raise
    return conn, db_cursor
def ve_convert(GREEN_COVERAGE):
    """Convert a green-coverage percentage into a fraction (percent / 100).

    Args:
        GREEN_COVERAGE: The percentage as a float (``35.0``), as text
            (``u'35%'``, ``u'30%以上'``), or as the same text encoded as a
            UTF-8 byte string.

    Returns:
        The percentage divided by 100 as a float, or ``None`` when the
        value has an unsupported type or is not numeric once the ``%``
        sign and the ``以上`` suffix are removed.
    """
    if isinstance(GREEN_COVERAGE, float):
        return GREEN_COVERAGE / 100

    # On Python 2 ``bytes`` is ``str``; decode so byte strings and unicode
    # text share a single code path below.
    if isinstance(GREEN_COVERAGE, bytes):
        try:
            GREEN_COVERAGE = GREEN_COVERAGE.decode('utf-8')
        except UnicodeDecodeError:
            return None

    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
    if not isinstance(GREEN_COVERAGE, text_type):
        return None

    no_sign = GREEN_COVERAGE.replace(u'%', u'').replace(u'以上', u'')
    try:
        return float(no_sign) / 100
    except ValueError:
        # Placeholder text instead of a number: report "unknown" rather
        # than aborting the caller.
        return None
def _display(text):
    """Return *text* in the form ``print`` can emit safely.

    On Python 2 (where ``str`` is ``bytes``) the text is UTF-8 encoded
    before printing; on other interpreters it is returned unchanged.
    """
    return text.encode('utf-8') if str is bytes else text


def _to_float(text):
    """Return *text* parsed as a float, or ``None`` when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def com():
    """Scrape the housing listings and record them in the SQLite database.

    Requests listing pages 1..317, extracts every estate's id, name and
    price from each page, requests the estate's own page for its area and
    green-coverage figures, prints each row as it is collected and prints
    the complete table once every page has been processed.

    Rows whose area or price is not numeric are printed but not stored.
    The green-coverage column is stored as NULL when it cannot be
    converted. Stored rows are committed one at a time, and the database
    connection is closed even if a request or insert fails.
    """
    list_page_url = 'http://hangzhou.fangtoo.com/building/cp'
    sector_page_url = 'http://hangzhou.fangtoo.com/building/'
    last_page = 317

    # Patterns are loop-invariant, so compile them once up front.
    name_id_filter = re.compile(
        u'<a href=["]http://hangzhou.fangtoo.com/building/(.*)/["] '
        u'target=["]_blank["] title=["](.*)["] target=["]_blank["]>')
    price_filter = re.compile(u'<span class=["]fontS30 Cred["]>(.*)</span>')
    area_filter = re.compile(u'<li>占地面积:(\\d.*)平方米</li>')
    ve_filter = re.compile(u'<li>绿化率:(.*)</li>')

    table = pt(["NO.", "id", "Name", "Area", "Green Coverage", "Price"])
    table.padding_width = 1

    conn, db_cursor = init_db()
    print('\033[2J\033[HConnected to database. Requesting data from the Internet...')

    # A running counter keeps row ids unique even when listing pages hold
    # different numbers of entries.
    row_number = 0
    try:
        for sector_count in range(1, last_page + 1):
            print('\nSector %d of %d\n' % (sector_count, last_page))
            list_page = requests.get(url=list_page_url + str(sector_count))
            name_id = name_id_filter.findall(list_page.text)
            price = price_filter.findall(list_page.text)

            for i, (estate_id, estate_name) in enumerate(name_id):
                # Fetch the detail page of *this* estate so that area and
                # green coverage belong to the id/name on the same row.
                sector_page = requests.get(url=sector_page_url + estate_id)
                area = area_filter.findall(sector_page.text)
                ve = ve_filter.findall(sector_page.text)

                # u'--' marks a value that could not be scraped.
                # NOTE(review): replacing u'万' with '0000' is only right
                # for whole numbers ("3万" -> "30000"); a value such as
                # "1.5万" becomes "1.50000" -- confirm against real data.
                area_text = area[0].replace(u'万', u'0000') if area else u'--'
                ve_text = ve[0].replace(u'。', u'') if ve else u'--'
                if i < len(price):
                    price_text = price[i].replace(u'万', u'0000')
                else:
                    price_text = u'--'

                row_number += 1
                display_row = [str(row_number).zfill(4)]
                display_row.extend(
                    _display(field)
                    for field in (estate_id, estate_name, area_text,
                                  ve_text, price_text))
                print(' '.join(display_row))
                table.add_row(display_row)

                area_value = _to_float(area_text)
                price_value = _to_float(price_text)
                if area_value is None or price_value is None:
                    # Incomplete listing: shown in the table, not stored.
                    continue

                green_ratio = None
                if ve:
                    # Hand over the text form (not the encoded bytes) and
                    # treat an unparsable value as unknown.
                    try:
                        green_ratio = ve_convert(ve_text)
                    except ValueError:
                        green_ratio = None

                # Bound parameters keep quotes in scraped names from
                # breaking (or injecting into) the statement; OR REPLACE
                # lets a re-run refresh rows instead of failing on the
                # primary key.
                db_cursor.execute(
                    "INSERT OR REPLACE INTO housing VALUES (?, ?, ?, ?, ?);",
                    (row_number, estate_name, area_value, green_ratio,
                     price_value))
                conn.commit()

        print(table)
    finally:
        conn.close()
    print('All requests successfully recorded.')
def main(args):
    """Script entry point: run the scraper.

    Args:
        args: Command-line arguments; accepted but not used.

    Returns:
        None, which ``sys.exit`` treats as a successful exit.
    """
    com()
    return None
# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    sys.exit(main(sys.argv))