python網頁爬蟲開發，下載表特板熱門文章中的圖片存於本地資料夾。

繼上篇文章"PTT爬蟲 – Marvel篇"，本篇介紹怎麼爬取表特板的圖片，並下載至本地資料夾。

讀取表特板與marvel板最大的不同除了一個是存圖片一個是文字之外，還有就是表特板需要紀錄滿18歲的cookie，所以在發送request時，需把cookie "over18"的值設為"1"。

引用python套件

BeautifulSoup
用於解析html DOM
pip3 install beautifulsoup4
requests
用於對網址發出http requests
pip3 install requests
urllib
透過URL下載圖片。
urllib 為 Python 3 內建的標準函式庫（程式中使用的是 urllib.request），不需另外安裝；urllib3 是另一個不同的第三方套件，本程式並未使用。

程式碼

架構
程式碼解析
Demo

架構

st=>start: 程式進入點
op1=>operation: 讀取PTT Beauty首頁
op2=>operation: 設定讀取頁數
op3=>operation: 解析HTML DOM
op4=>operation: 儲存圖片
op5=>operation: 前往下一頁URL
cond=>condition: 是否最後一頁?
e=>end: 程式結束

st->op1->op2->op3->op4->cond
cond(no)->op5->op3
cond(yes)->e

st=>start: 程式進入點

op1=>operation: 讀取PTT Beauty首頁

op2=>operation: 設定讀取頁數

op3=>operation: 解析HTML DOM

op4=>operation: 儲存圖片

op5=>operation: 前往下一頁URL

cond=>condition: 是否最後一頁?

e=>end: 程式結束

st->op1->op2->op3->op4->cond

cond(no)->op5->op3

cond(yes)->e

程式碼解析

from bs4 import BeautifulSoup
import requests
import urllib.request
import re # 正規表示
import os # 操作系統資料夾、檔案

def main(startURL="https://www.ptt.cc/bbs/beauty/index.html", targetLoop=1000):
    """Walk the board's index pages and process each one with ``parsePage``.

    Starting at ``startURL``, every page is handed to ``parsePage``; the URL
    it returns becomes the page visited in the next iteration.

    Args:
        startURL: Index page the crawl starts from.
        targetLoop: Maximum number of index pages to process.
    """
    URL_now = startURL
    for countLoop in range(1, targetLoop + 1):
        print("Page: " + str(countLoop) + "/ " + str(targetLoop))
        URL_now = parsePage(URL_now)
        if not URL_now:
            # Nothing left to follow; stop instead of requesting an invalid URL.
            break

""" 找爆文 存入list """
def parsePage(pageURL):
    """Collect the hot articles on one index page and process each of them.

    An article is kept when its bracketed category is neither "創作" nor
    "公告" and its push count is either "爆" or a number >= 85.  Category
    brackets may be half-width or full-width.  Every kept article is passed
    to ``loadArticle`` as ``[title, href]``.

    Args:
        pageURL: Absolute URL of the index page to parse.

    Returns:
        Absolute URL of the page to visit next, or ``None`` when
        ``findNextPageURL`` yields no link.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    res = requests.get(pageURL, cookies={'over18': '1'}, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    href_list = list()
    for div in findAllTitle(soup):
        titleDiv = div.find('div', class_='title')
        nrecDiv = div.find('div', class_='nrec')
        # Rows without a link (e.g. removed articles) cannot be opened.
        if titleDiv is None or nrecDiv is None or titleDiv.a is None:
            continue

        # Text after the first opening bracket, up to the closing bracket.
        category = re.search(r'[\[［]([^\]］]*)', titleDiv.text)
        # Rows without any category tag are skipped, as they were before.
        if category is None or category.group(1) in ("創作", "公告"):
            continue

        nrec = nrecDiv.text.strip()
        # Non-numeric counters other than "爆" (empty, "X1", ...) never qualify.
        if nrec == '爆' or (nrec.isdigit() and int(nrec) >= 85):
            # Take the title from the link itself so the result does not
            # depend on the whitespace layout around it.
            href_list.append([titleDiv.a.text.strip(), titleDiv.a.get('href')])

    for item in href_list:
        loadArticle(item)

    # Move on to the next page of the crawl.
    nextURL = findNextPageURL(soup)
    if not nextURL:
        return None
    return "https://www.ptt.cc" + nextURL

""" 下一頁URL """
def findNextPageURL(HTMLdata):
    """Return the href of the fourth link inside the action bar container.

    NOTE(review): the fourth link is presumably the board's "previous page"
    (older posts) button, which the crawler follows next -- confirm against
    the live page markup.
    """
    actionBar = HTMLdata.find('div', id='action-bar-container')
    links = actionBar.find_all('a')
    return links[3].get('href')

""" 取當前頁面所有文章列 """
def findAllTitle(HTMLdata):
    """Return every ``div`` with class ``r-ent`` found in ``HTMLdata``."""
    return HTMLdata.find_all('div', class_='r-ent')

""" 讀取爆文內文圖檔連結 存為jpg """
def loadArticle(data):
    """Fetch one article, collect its imgur links and hand them to ``save``.

    Args:
        data: ``[title, href]`` where ``href`` is the article path relative
            to ``https://www.ptt.cc``.
    """
    print(data)
    title = fixFilePath(data[0])
    URL = "https://www.ptt.cc" + data[1]
    res = requests.get(URL, cookies={'over18': '1'}, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')

    mainContent = soup.find('div', id='main-content')
    if mainContent is None:
        # Not an article page (e.g. removed post or error page): nothing to save.
        return

    # Dots are escaped so only the imgur.com hosts match, not look-alikes
    # such as "imgurXcom".
    imgurPattern = re.compile(r'^https?://(i\.)?(m\.)?imgur\.com/')
    img_urls = list()
    for link in mainContent.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None, which re.match cannot handle.
        if href and imgurPattern.match(href):
            img_urls.append(href)

    # save file
    save(img_urls, title)

""" Save image """
def save(img_urls, title):
    """Download the given imgur links into ``./beauty/<title>``.

    Every link is first rewritten to the ``i.imgur.com`` host, and ``.jpg`` is
    appended when the link has no file extension of its own.  Failures are
    printed and do not stop the remaining downloads.

    Args:
        img_urls: imgur links to download; nothing happens when empty.
        title: Folder name; surrounding whitespace is stripped.
    """
    if not img_urls:
        return

    folder_name = "./beauty/" + title.strip()
    try:
        # exist_ok: a rerun, or a second article with the same title, must not
        # abort the downloads just because the folder is already there.
        os.makedirs(folder_name, exist_ok=True)
    except OSError as e:
        print(e)
        return

    for img_url in img_urls:
        try:
            direct_url = _toDirectImageURL(img_url)
            file_name = direct_url.split('/')[-1]
            urllib.request.urlretrieve(direct_url, os.path.join(folder_name, file_name))
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(e)


def _toDirectImageURL(img_url):
    """Rewrite an imgur link so that it points at the i.imgur.com host."""
    # e.g. 'http://imgur.com/9487qqq.jpg' -> 'http:', 'imgur.com/9487qqq.jpg'
    scheme, rest = img_url.split('//', 1)
    # Query strings and fragments are not part of the file name.
    rest = rest.split('?', 1)[0].split('#', 1)[0]
    if rest.startswith('m.'):
        rest = 'i.' + rest[2:]
    elif not rest.startswith('i.'):
        rest = 'i.' + rest
    # Only add ".jpg" when the last path segment carries no extension, so that
    # "abc.png" is not turned into "abc.png.jpg".
    last_segment = rest.partition('/')[2].split('/')[-1]
    if not os.path.splitext(last_segment)[1]:
        rest += '.jpg'
    return scheme + '//' + rest

""" Fix the illegal file path charactors """
def fixFilePath(oriPath):
    """Return ``oriPath`` with characters unsuitable for a path replaced by spaces.

    The replaced characters are: slash, backslash, double quote, asterisk,
    question mark, colon, semicolon, vertical bar, comma and angle brackets.
    """
    illegalChars = '/\\"*?:;|,<>'
    table = str.maketrans(illegalChars, ' ' * len(illegalChars))
    return oriPath.translate(table)

# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

from bs4 import BeautifulSoup

import requests

import urllib.request

import re # 正規表示

import os # 操作系統資料夾、檔案

def main(startURL="https://www.ptt.cc/bbs/beauty/index.html", targetLoop=1000):
    """Walk the board's index pages and process each one with ``parsePage``.

    Starting at ``startURL``, every page is handed to ``parsePage``; the URL
    it returns becomes the page visited in the next iteration.

    Args:
        startURL: Index page the crawl starts from.
        targetLoop: Maximum number of index pages to process.
    """
    URL_now = startURL
    for countLoop in range(1, targetLoop + 1):
        print("Page: " + str(countLoop) + "/ " + str(targetLoop))
        URL_now = parsePage(URL_now)
        if not URL_now:
            # Nothing left to follow; stop instead of requesting an invalid URL.
            break

""" 找爆文存入list """

def parsePage(pageURL):
    """Collect the hot articles on one index page and process each of them.

    An article is kept when its bracketed category is neither "創作" nor
    "公告" and its push count is either "爆" or a number >= 85.  Category
    brackets may be half-width or full-width.  Every kept article is passed
    to ``loadArticle`` as ``[title, href]``.

    Args:
        pageURL: Absolute URL of the index page to parse.

    Returns:
        Absolute URL of the page to visit next, or ``None`` when
        ``findNextPageURL`` yields no link.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    res = requests.get(pageURL, cookies={'over18': '1'}, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    href_list = list()
    for div in findAllTitle(soup):
        titleDiv = div.find('div', class_='title')
        nrecDiv = div.find('div', class_='nrec')
        # Rows without a link (e.g. removed articles) cannot be opened.
        if titleDiv is None or nrecDiv is None or titleDiv.a is None:
            continue

        # Text after the first opening bracket, up to the closing bracket.
        category = re.search(r'[\[［]([^\]］]*)', titleDiv.text)
        # Rows without any category tag are skipped, as they were before.
        if category is None or category.group(1) in ("創作", "公告"):
            continue

        nrec = nrecDiv.text.strip()
        # Non-numeric counters other than "爆" (empty, "X1", ...) never qualify.
        if nrec == '爆' or (nrec.isdigit() and int(nrec) >= 85):
            # Take the title from the link itself so the result does not
            # depend on the whitespace layout around it.
            href_list.append([titleDiv.a.text.strip(), titleDiv.a.get('href')])

    for item in href_list:
        loadArticle(item)

    # Move on to the next page of the crawl.
    nextURL = findNextPageURL(soup)
    if not nextURL:
        return None
    return "https://www.ptt.cc" + nextURL

""" 下一頁URL """

def findNextPageURL(HTMLdata):
    """Return the href of the fourth link inside the action bar container.

    NOTE(review): the fourth link is presumably the board's "previous page"
    (older posts) button, which the crawler follows next -- confirm against
    the live page markup.
    """
    actionBar = HTMLdata.find('div', id='action-bar-container')
    links = actionBar.find_all('a')
    return links[3].get('href')

""" 取當前頁面所有文章列 """

def findAllTitle(HTMLdata):
    """Return every ``div`` with class ``r-ent`` found in ``HTMLdata``."""
    return HTMLdata.find_all('div', class_='r-ent')

""" 讀取爆文內文圖檔連結存為jpg """

def loadArticle(data):
    """Fetch one article, collect its imgur links and hand them to ``save``.

    Args:
        data: ``[title, href]`` where ``href`` is the article path relative
            to ``https://www.ptt.cc``.
    """
    print(data)
    title = fixFilePath(data[0])
    URL = "https://www.ptt.cc" + data[1]
    res = requests.get(URL, cookies={'over18': '1'}, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')

    mainContent = soup.find('div', id='main-content')
    if mainContent is None:
        # Not an article page (e.g. removed post or error page): nothing to save.
        return

    # Dots are escaped so only the imgur.com hosts match, not look-alikes
    # such as "imgurXcom".
    imgurPattern = re.compile(r'^https?://(i\.)?(m\.)?imgur\.com/')
    img_urls = list()
    for link in mainContent.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None, which re.match cannot handle.
        if href and imgurPattern.match(href):
            img_urls.append(href)

    # save file
    save(img_urls, title)

""" Save image """

def save(img_urls, title):
    """Download the given imgur links into ``./beauty/<title>``.

    Every link is first rewritten to the ``i.imgur.com`` host, and ``.jpg`` is
    appended when the link has no file extension of its own.  Failures are
    printed and do not stop the remaining downloads.

    Args:
        img_urls: imgur links to download; nothing happens when empty.
        title: Folder name; surrounding whitespace is stripped.
    """
    if not img_urls:
        return

    folder_name = "./beauty/" + title.strip()
    try:
        # exist_ok: a rerun, or a second article with the same title, must not
        # abort the downloads just because the folder is already there.
        os.makedirs(folder_name, exist_ok=True)
    except OSError as e:
        print(e)
        return

    for img_url in img_urls:
        try:
            direct_url = _toDirectImageURL(img_url)
            file_name = direct_url.split('/')[-1]
            urllib.request.urlretrieve(direct_url, os.path.join(folder_name, file_name))
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(e)


def _toDirectImageURL(img_url):
    """Rewrite an imgur link so that it points at the i.imgur.com host."""
    # e.g. 'http://imgur.com/9487qqq.jpg' -> 'http:', 'imgur.com/9487qqq.jpg'
    scheme, rest = img_url.split('//', 1)
    # Query strings and fragments are not part of the file name.
    rest = rest.split('?', 1)[0].split('#', 1)[0]
    if rest.startswith('m.'):
        rest = 'i.' + rest[2:]
    elif not rest.startswith('i.'):
        rest = 'i.' + rest
    # Only add ".jpg" when the last path segment carries no extension, so that
    # "abc.png" is not turned into "abc.png.jpg".
    last_segment = rest.partition('/')[2].split('/')[-1]
    if not os.path.splitext(last_segment)[1]:
        rest += '.jpg'
    return scheme + '//' + rest

""" Fix the illegal file path charactors """

def fixFilePath(oriPath):
    """Return ``oriPath`` with characters unsuitable for a path replaced by spaces.

    The replaced characters are: slash, backslash, double quote, asterisk,
    question mark, colon, semicolon, vertical bar, comma and angle brackets.
    """
    illegalChars = '/\\"*?:;|,<>'
    table = str.maketrans(illegalChars, ' ' * len(illegalChars))
    return oriPath.translate(table)

# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

Function解析

– def parsePage(pageURL)

讀取文章列表HTML DOM，取得人氣為 "爆" 或是推文數大於等於 85 的文章（分類為「創作」與「公告」的文章除外）。
將文章標題及文章URL存入文章list。
呼叫loadArticle，分別處理list中各文章內容。

讀取文章列表HTML DOM，取得人氣為 "爆" 或是推文數大於等於 85 的文章（分類為「創作」與「公告」的文章除外）。

將文章標題及文章URL存入文章list。

呼叫loadArticle，分別處理list中各文章內容。

– def findNextPageURL(HTMLdata)

此function將回傳下一頁的URL。

1	此function將回傳下一頁的URL。

– def findAllTitle(HTMLdata)

此function將回傳當前文章列表中所有的文章列（class為 r-ent 的div），為一list。

1	此function將回傳當前文章列表中所有的文章列（class為 r-ent 的div），為一list。

– def loadArticle(data)

讀取並parse文章內容，擷取其中的imgur圖片連結，再呼叫save下載圖片。

1	讀取並parse文章內容，擷取其中的imgur圖片連結，再呼叫save下載圖片。

– def save(img_urls, title)

儲存圖片為jpg。

儲存圖片為jpg。

– def fixFilePath(oriPath)

處理存放圖片的資料夾名稱，針對windows不合法的檔案名稱字元進行replace。

1	處理存放圖片的資料夾名稱，針對windows不合法的檔案名稱字元進行replace。

Demo

資料夾內容
圖片內容

[python] PTT爬蟲 – Beauty(表特)篇

繼上篇文章"PTT爬蟲 – Marvel篇"，本篇介紹怎麼爬取表特板的圖片，並下載至本地資料夾。

讀取表特板與marvel板最大的不同除了一個是存圖片一個是文字之外，還有就是表特板需要紀錄滿18歲的cookie，所以在發送request時，需把cookie "over18"的值設為"1"。

引用python套件

程式碼

架構

程式碼解析

Function解析

– def parsePage(pageURL)

– def findNextPageURL(HTMLdata)

– def findAllTitle(HTMLdata)

– def loadArticle(data)

– def save(img_urls, title)

– def fixFilePath(oriPath)

Demo

留言

撰寫回覆或留言取消回覆

[python] PTT爬蟲 – Beauty(表特)篇

繼上篇文章"PTT爬蟲 – Marvel篇"，本篇介紹怎麼爬取表特板的圖片，並下載至本地資料夾。

讀取表特板與marvel板最大的不同除了一個是存圖片一個是文字之外，還有就是表特板需要紀錄滿18歲的cookie，所以在發送request時，需把cookie "over18"的值設為"1"。

引用python套件

程式碼

架構

程式碼解析

Function解析

– def parsePage(pageURL)

– def findNextPageURL(HTMLdata)

– def findAllTitle(HTMLdata)

– def loadArticle(data)

– def save(img_urls, title)

– def fixFilePath(oriPath)

Demo

留言

撰寫回覆或留言 取消回覆

撰寫回覆或留言取消回覆