您现在的位置是:Python抓取订单页面
Python抓取订单页面
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# Log in to the site, walk the paginated order list, save every page to
# CITY/page_NNNNN.html and append the order blocks of each page to
# CITY_index.html. Progress and failures are appended to
# CITY_log.txt / CITY_success.txt / CITY_error.txt.
import html
import os
import re

import requests
from lxml import etree

CITY = 'city'
USERNAME = 'name'
PASSWORD = 'password'
PAGE_START = 1
# Exclusive upper bound: the driver loop iterates range(PAGE_START, PAGE_END).
PAGE_END = 1100
# Seconds to wait for the server before giving up on a single request, so one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def getSession(username, password):
    """Post the login form and return the ``requests.Session`` that was used.

    The same session object is reused for the page requests so that whatever
    cookies the login response set are sent along with them.

    NOTE(review): the login response is not inspected, so a rejected login is
    not detected here -- confirm what the server returns on failure.
    """
    login_url = 'http://www.test.com/index.php?ajax=1'
    form = {
        "com": 'com_passport',
        "method": 'dologin',
        "ID": username,
        "PWD": password,
        "checkbox": 'on',
    }
    # Browser-like User-Agent, sent with the login request only.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    session = requests.Session()
    session.post(login_url, data=form, headers=headers, timeout=REQUEST_TIMEOUT)
    return session


def getPageUrl(num):
    """Return the order-list URL for page ``num`` (int or str)."""
    page_base = 'http://www.test.com/index.php?method=index&app=order&page='
    return page_base + str(num)


def saveFile(text, name):
    """Write ``text`` to the file ``name``, replacing any existing content."""
    # Explicit UTF-8 so non-ASCII page content does not depend on the locale.
    with open(name, "w", encoding="utf-8") as out:
        out.write(text)


def addLog(type, text):
    """Append ``text`` plus a newline to the log file ``CITY_<type>.txt``.

    ``type`` shadows the builtin, but the parameter name is kept so existing
    callers are unaffected.
    """
    log_path = CITY + '_' + type + '.txt'
    with open(log_path, "a", encoding="utf-8") as log:
        log.write(text)
        log.write("\n")


def getOrder(text):
    """Extract every ``<div class="order-content">`` from an HTML page.

    Returns the serialized divs, each followed by a newline, concatenated in
    document order; returns ``''`` when the page contains none.
    """
    if not text or not text.strip():
        # Nothing to parse; report "no orders" instead of failing in the parser.
        return ''
    tree = etree.HTML(text)
    if tree is None:
        # The parser may hand back no root element for unusable input.
        return ''
    fragments = []
    for node in tree.xpath("//div[@class='order-content']"):
        markup = etree.tostring(node).decode('utf-8')
        # tostring() emits character references; turn them back into text.
        fragments.append(html.unescape(markup) + "\n")
    return "".join(fragments)


##########################################################
# Saved pages go to CITY/page_NNNNN.html, so the directory must exist.
os.makedirs(CITY, exist_ok=True)

# Per-city index file; the context manager closes it even if the run aborts.
with open(CITY + '_index.html', "w", encoding="utf-8") as fcity:
    fcity.write("<link rel='stylesheet' type='text/css' href='style.css'>\n")

    # Log in once and reuse the session for every page.
    SESSION = getSession(USERNAME, PASSWORD)

    for num in range(PAGE_START, PAGE_END):
        page = str(num)
        PAGE_FILE = CITY + '/page_' + page.zfill(5) + '.html'
        PAGE_URL = getPageUrl(page)

        addLog('log', PAGE_URL)
        print(PAGE_URL)

        try:
            PAGE_RES = SESSION.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # A network failure on one page must not abort the whole crawl.
            addLog('error', '获取第' + page + '页失败')
            continue

        print(PAGE_RES.status_code)
        if PAGE_RES.status_code != 200:
            addLog('error', '获取第' + page + '页失败')
            continue
        addLog('success', '获取第' + page + '页成功')

        # Keep a raw copy of the page.
        PAGE_TEXT = PAGE_RES.text
        saveFile(PAGE_TEXT, PAGE_FILE)

        # Append this page's orders to the index file.
        ORDER_TEXT = getOrder(PAGE_TEXT)
        fcity.write("\n第" + page + "页\n")
        fcity.write(ORDER_TEXT)
        if ORDER_TEXT == '':
            fcity.write("获取内容失败")
            addLog('error', '获取第' + page + '页详情失败')
随便看看
站长简介
分类
最新文章
热门文章
- 微信支付退款结果通知解密
- 到ICANN投诉三五互联获取域名转移密码成功
- Linux修改密码提示passwd /usr/share/cracklib/pw_dict: error reading header
- {"errcode":45047,"errmsg":"out of response count limit hint]"}
- 微信html弹出点击右上角分享到朋友圈
- Apache禁用TRACE Method和添加X-frame-options响应头
- PHP将B转换为KB、MB、GB
- 构建微信公众号被动回复image图片消息xml
- mysqld_safe Number of processes running now
- CentOS使用chkconfig提示systemctl enable xxxx.service