本站即日起正式部署SSL证书,启用HTTPS安全连接。联系站长:416326801#qq.com(#改成@)

Latest from the blog

python获取目录下的最新文件或文件夹

标签: Python

def new_report(test_report):
    """Return the path of the most recently modified entry in a directory.

    Args:
        test_report: Path of the directory to inspect.

    Returns:
        ``test_report`` joined with the name of the file or sub-directory
        that has the latest modification time.

    Raises:
        IndexError: If the directory contains no entries.
    """
    # Names of every file and folder directly inside the directory.
    lists = os.listdir(test_report)
    print(lists)
    # Oldest first.  os.path.join builds the lookup path with the platform's
    # own separator instead of a hard-coded Windows backslash.
    lists.sort(key=lambda fn: os.path.getmtime(os.path.join(test_report, fn)))
    # After sorting, the newest entry is the last one.
    file_new = os.path.join(test_report, lists[-1])
    print(file_new)
    return file_new
if __name__ == "__main__":
    # Path of the directory to scan.
    test_report = "path"
    new_report(test_report)

Python分析网页并下载图片

标签: Python

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import re
import os
import urllib.request

from lxml import etree
import html

# Folder that is walked for saved pages.
ROOT_DIR = "dalian/"
# Prefix prepended to every downloaded image name.
IMG_DIR = "{}images/".format(ROOT_DIR)

## Extract images
def openOrder(ORDER_FILE):
    """Download the linked images referenced by a saved HTML page.

    Reads the file at ``ORDER_FILE``, collects the ``src`` attributes of
    ``<img>`` elements that are direct children of ``<a>`` elements, and
    stores every image that answers with HTTP 200 under ``IMG_DIR`` using
    the basename of its URL as the file name.

    Args:
        ORDER_FILE: Path of the HTML file to scan.
    """
    print(ORDER_FILE)
    # The context manager closes the file even if reading fails.
    with open(ORDER_FILE, "r") as fo:
        FILE_TEXT = fo.read()
    # Collect the image addresses.
    html_text = etree.HTML(FILE_TEXT)
    img_data = html_text.xpath("//a/img//@src")
    if img_data:
        # Make sure the download folder exists before writing into it.
        os.makedirs(IMG_DIR, exist_ok=True)
    for IMG_URL in img_data:
        print(IMG_URL)
        IMG_NAME = os.path.basename(IMG_URL)
        print(IMG_NAME)
        response = requests.get(IMG_URL)
        # Only keep images the server answered with 200.
        if response.status_code == 200:
            # Write the body that was just fetched instead of requesting the
            # same URL a second time.
            with open(IMG_DIR + IMG_NAME, "wb") as img_file:
                img_file.write(response.content)


# Walk the directory tree and process matching files
for maindir, subdir, file_name_list in os.walk(ROOT_DIR):
    for file_name in file_name_list:
        # Files whose name does not begin with "app" are ignored.
        if not file_name.startswith('app'):
            continue
        # Full path of the matching file.
        ORDER_FILE = os.path.join(maindir, file_name)
        openOrder(ORDER_FILE)

Python分析网页并抓取内容

标签: Python

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import re
import os

from lxml import etree
import html

# Prefix of the index, log and output file names used by this script.
CITY = "city"
# Credentials handed to getSession.
USERNAME = "user"
PASSWORD = "name"

# Log in and keep the session
def getSession(username, password):
    """Post the login form and return the session it was sent on.

    Args:
        username: Value sent as the ``ID`` form field.
        password: Value sent as the ``PWD`` form field.

    Returns:
        The ``requests.Session`` on which the login request was made.
    """
    session = requests.Session()
    session.post(
        # Login endpoint.
        'http://www.test.com/index.php?ajax=1',
        # Account form fields.
        data={
            "com": 'com_passport',
            "method": 'dologin',
            "ID": username,
            "PWD": password,
            "checkbox": 'on',
        },
        # Browser-like User-Agent sent with the login request.
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        },
    )
    return session

# Build a list page URL
def getPageUrl(num):
    """Return the order list URL for page ``num``.

    Args:
        num: Page number; it is converted with ``str`` before being appended.

    Returns:
        The list URL ending in ``page=<num>``.
    """
    page_suffix = str(num)
    return "".join((
        'http://www.test.com/index.php?method=index&app=order&page=',
        page_suffix,
    ))
    
# Save a file
def saveFile(text, name):
    """Write ``text`` to the file ``name``, replacing any previous content.

    Args:
        text: String to write.
        name: Path of the file to create or overwrite.
    """
    # The context manager closes the handle even if the write raises.
    with open(name, "w") as fo:
        fo.write(text)


# Append to a log file
def addLog(type, text):
    """Append ``text`` as one line to the log file selected by ``type``.

    The log file name is ``CITY + '_' + type + '.txt'``.

    Args:
        type: Log category that becomes part of the file name.
        text: Message to append; a newline is written after it.
    """
    ERR_FILE = CITY + '_' + type + '.txt'
    # The context manager closes the handle even if a write raises.
    with open(ERR_FILE, "a") as ferror:
        ferror.write(text)
        ferror.write("\n")



##########################################################

# City index file
FILE_NAME = CITY + '_index.html'
# Read the whole index; the context manager closes the file even on error.
with open(FILE_NAME, 'r', encoding='utf-8') as fo:
    text = fo.read()


# Log in
SESSION = getSession(USERNAME, PASSWORD)

# Make sure the folder that receives the order pages exists.
os.makedirs(CITY, exist_ok=True)

# Extract the URLs.  The parsed tree gets its own name so it does not
# rebind the imported ``html`` module.
index_tree = etree.HTML(text)
url_data = index_tree.xpath('//a/@href')
for ORDER_URL in url_data:
    ORDER_RES = SESSION.get(ORDER_URL)
    # Record every URL that was requested.
    addLog('order_log', ORDER_URL)
    print(ORDER_URL)
    print(ORDER_RES.status_code)
    if ORDER_RES.status_code != 200:
        addLog('order_error', '获取'+ ORDER_URL +'失败')
        continue
    addLog('success', '获取'+ ORDER_URL +'成功')
    ORDER_TEXT = ORDER_RES.text
    # The output file name is the URL from "app=order" to its end.
    matchObj = re.search(r"app=order(.*)", ORDER_URL)
    if matchObj is None:
        # A link without "app=order" gives no file name; log it and move on
        # instead of crashing on ``None.group()``.
        addLog('order_error', '无法从'+ ORDER_URL +'提取文件名')
        continue
    ORDER_NAME = matchObj.group()
    ORDER_FILE = CITY + '/' + ORDER_NAME + '.html'
    # Save the page.
    saveFile(ORDER_TEXT, ORDER_FILE)
    print(ORDER_FILE)

Python抓取订单页面

标签: Python

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import re

from lxml import etree
import html

# Prefix of the index, log and output file names used by this script.
CITY = "city"
# Credentials handed to getSession.
USERNAME = "name"
PASSWORD = "password"

# Bounds passed to range(): PAGE_START is fetched, PAGE_END is not.
PAGE_START = 1
PAGE_END = 1100


# Log in and keep the session
def getSession(username, password):
    """Send the login form and hand back the session that carried it.

    Args:
        username: Value sent as the ``ID`` form field.
        password: Value sent as the ``PWD`` form field.

    Returns:
        The ``requests.Session`` on which the login request was made.
    """
    # Account form fields.
    form = {
        "com": 'com_passport',
        "method": 'dologin',
        "ID": username,
        "PWD": password,
        "checkbox": 'on',
    }
    # Browser-like User-Agent sent with the login request.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    session = requests.Session()
    session.post(
        'http://www.test.com/index.php?ajax=1',
        data=form,
        headers=browser_headers,
    )
    return session

# Build a list page URL
def getPageUrl(num):
    """Return the order list URL for page ``num``.

    Args:
        num: Page number; it is converted with ``str`` before being appended.

    Returns:
        The list URL ending in ``page=<num>``.
    """
    url_parts = (
        'http://www.test.com/index.php?method=index&app=order&page=',
        str(num),
    )
    return "".join(url_parts)
    
# Save a file
def saveFile(text, name):
    """Write ``text`` to the file ``name``, replacing any previous content.

    Args:
        text: String to write.
        name: Path of the file to create or overwrite.
    """
    # The context manager closes the handle even if the write raises.
    with open(name, "w") as fo:
        fo.write(text)


# Append to a log file
def addLog(type, text):
    """Append ``text`` as one line to the log file selected by ``type``.

    The log file name is ``CITY + '_' + type + '.txt'``.

    Args:
        type: Log category that becomes part of the file name.
        text: Message to append; a newline is written after it.
    """
    ERR_FILE = CITY + '_' + type + '.txt'
    # The context manager closes the handle even if a write raises.
    with open(ERR_FILE, "a") as ferror:
        ferror.write(text)
        ferror.write("\n")

## Extract order details
def getOrder(text):
    """Return the markup of every ``order-content`` div found in ``text``.

    Args:
        text: HTML source of a list page.

    Returns:
        The serialized divs, HTML-unescaped and each followed by a newline,
        concatenated in document order.  An empty string when no div matches.
    """
    order_divs = etree.HTML(text).xpath("//div[@class='order-content']")
    pieces = []
    for node in order_divs:
        markup = etree.tostring(node).decode('utf-8')
        pieces.append(html.unescape(markup) + "\n")
    return ''.join(pieces)


##########################################################

import os

# Make sure the folder that receives the per-page files exists.
os.makedirs(CITY, exist_ok=True)

# City index file.  It is written as UTF-8 because the script that later
# reads CITY + '_index.html' opens it with encoding='utf-8'.  The context
# manager flushes and closes it when the loop ends or fails.
with open(CITY + '_index.html', "w", encoding='utf-8') as fcity:
    fcity.write("<link rel='stylesheet' type='text/css' href='style.css'>\n")

    # Log in
    SESSION = getSession(USERNAME, PASSWORD)

    # Fetch every list page.
    # NOTE(review): range() stops before PAGE_END, so page PAGE_END itself is
    # never requested - confirm that this is intended.
    for num in range(PAGE_START, PAGE_END):
        page = str(num)
        PAGE_FILE = CITY + '/page_' + page.zfill(5) + '.html'
        PAGE_URL = getPageUrl(page)
        PAGE_RES = SESSION.get(PAGE_URL)
        # Record every URL that was requested.
        addLog('log', PAGE_URL)
        print(PAGE_URL)
        print(PAGE_RES.status_code)
        if PAGE_RES.status_code != 200:
            addLog('error', '获取第'+ page +'页失败')
            continue
        addLog('success', '获取第'+ page +'页成功')
        # Keep a copy of the raw page.
        PAGE_TEXT = PAGE_RES.text
        saveFile(PAGE_TEXT, PAGE_FILE)
        # Append the orders of this page to the index.
        ORDER_TEXT = getOrder(PAGE_TEXT)
        fcity.write("\n第" + page + "页\n")
        fcity.write(ORDER_TEXT)
        if ORDER_TEXT == '':
            # The page loaded but held no order block.
            fcity.write("获取内容失败")
            addLog('error', '获取第'+ page +'页详情失败')