您现在的位置是:Python抓取订单页面

Python抓取订单页面

Python 日期:2019-08-09 点击:998
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import html
import os
import re

import requests
from lxml import etree

# Label (presumably a city name) used as the prefix of the index/log file
# names and as the directory that receives the raw page dumps.
CITY = 'city'
# Credentials handed to getSession() for the site login.
# NOTE(review): these look like placeholders - confirm before running.
USERNAME = 'name'
PASSWORD = 'password'

# Page range to fetch. The main loop iterates range(PAGE_START, PAGE_END),
# so PAGE_END itself is not fetched.
PAGE_START = 1
PAGE_END = 1100


def getSession(username, password, timeout=30):
    """Log in to the site and return the ``requests.Session`` used for it.

    The login form is POSTed to the site's AJAX endpoint. Cookies set by the
    server are kept on the session object, so later requests made through the
    returned session reuse the login.

    Args:
        username: Account ID, sent as the ``ID`` form field.
        password: Account password, sent as the ``PWD`` form field.
        timeout: Seconds to wait for the login request. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The session the login request was sent with. The login response is
        not inspected, so a session is returned even when the credentials
        were rejected.

    Raises:
        requests.RequestException: If the login request fails at the network
            level or times out.
    """
    login_url = 'http://www.test.com/index.php?ajax=1'
    # Login form fields.
    form = {"com":'com_passport',"method":'dologin',"ID":username,"PWD":password,"checkbox":'on'}
    # Browser-like User-Agent.
    browser_headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    session = requests.Session()
    # Install the headers on the session itself so that every later request
    # made through it (not only the login POST) sends the same User-Agent.
    session.headers.update(browser_headers)
    session.post(login_url, data=form, timeout=timeout)
    return session

def getPageUrl(num):
    """Return the URL of order list page ``num``.

    ``num`` may be an int or a string; it is converted with ``str()`` and
    appended to the fixed list URL.
    """
    list_url_prefix = 'http://www.test.com/index.php?method=index&app=order&page='
    page_number = str(num)
    return ''.join((list_url_prefix, page_number))
    
def saveFile(text, name, encoding="utf-8"):
    """Write ``text`` to the file ``name``, replacing any existing content.

    Args:
        text: String to write.
        name: Destination path. Missing parent directories are created,
            because ``open()`` fails with ``FileNotFoundError`` otherwise.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the file even if write() raises.
    with open(name, "w", encoding=encoding) as out_file:
        out_file.write(text)


def addLog(type, text):
    """Append ``text`` as one line to the log file ``<CITY>_<type>.txt``.

    Args:
        type: Log category; becomes part of the file name. The name shadows
            the ``type`` builtin but is kept so existing callers still work.
        text: Message to append; a newline is added after it.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    log_path = CITY + '_' + type + '.txt'
    # Explicit UTF-8 keeps the non-ASCII log messages writable regardless of
    # the platform's locale encoding; the context manager closes the file
    # even if write() raises.
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write(text)
        log_file.write("\n")

def getOrder(text):
    """Extract the order blocks from a page and return them as one string.

    Args:
        text: HTML source of an order list page.

    Returns:
        The serialized ``<div class='order-content'>`` elements in document
        order, each followed by a newline. An empty string when the page
        contains no such element (or ``text`` is empty).
    """
    if not text:
        return ''
    document = etree.HTML(text)
    # NOTE(review): lxml may hand back None for input it cannot build a tree
    # from - treated as "no orders" here; confirm against the lxml version.
    if document is None:
        return ''
    order_nodes = document.xpath("//div[@class='order-content']")
    # Serialize straight to str. The default byte serialization emits
    # non-ASCII characters as numeric references, and undoing that with
    # html.unescape() would also unescape &lt; / &amp; inside the order text
    # and corrupt the markup.
    return "".join(
        etree.tostring(node, encoding='unicode') + "\n"
        for node in order_nodes
    )


##########################################################

# Directory that receives the raw page dumps; open() does not create
# missing parent directories, so make sure it exists up front.
os.makedirs(CITY, exist_ok=True)

# City index file: collects the extracted order blocks of every page.
# The context manager guarantees the file is flushed and closed even if the
# run is interrupted by an exception.
with open(CITY + '_index.html', "w", encoding="utf-8") as fcity:
    fcity.write("<link rel='stylesheet' type='text/css' href='style.css'>\n")

    # Log in once; the session is reused for every page request.
    SESSION = getSession(USERNAME, PASSWORD)

    # Fetch each page in turn. range() excludes PAGE_END.
    for num in range(PAGE_START, PAGE_END):
        page = str(num)
        PAGE_FILE = CITY + '/page_' + page.zfill(5) + '.html'
        PAGE_URL = getPageUrl(page)
        # Record the URL before requesting it so failed attempts are logged too.
        addLog('log', PAGE_URL)
        print(PAGE_URL)
        try:
            PAGE_RES = SESSION.get(PAGE_URL, timeout=30)
        except requests.RequestException as exc:
            # A single network failure must not abort the whole run;
            # record it and move on to the next page.
            addLog('error', '获取第'+ page +'页失败: ' + str(exc))
            print(exc)
            continue
        print(PAGE_RES.status_code)

        if PAGE_RES.status_code != 200:
            addLog('error', '获取第'+ page +'页失败')
            continue

        addLog('success', '获取第'+ page +'页成功')
        # Keep a raw copy of the page.
        PAGE_TEXT = PAGE_RES.text
        saveFile(PAGE_TEXT, PAGE_FILE)
        # Extract the orders and append them to the city index.
        ORDER_TEXT = getOrder(PAGE_TEXT)
        fcity.write("\n第" + page + "页\n")
        fcity.write(ORDER_TEXT)
        if not ORDER_TEXT:
            # The page loaded but no order block was found in it.
            fcity.write("获取内容失败")
            addLog('error', '获取第'+ page +'页详情失败')
标签: Python

站长简介

姓名:喻理
微信:yuli0927
邮箱:yuli0927@126.com
不懂业务的运维工程师不是一个好程序员。

分类

最新文章

热门文章