はてなブログの画像のバックアップを試しました。
こんな感じです。
ソースコードはこんな感じです。
# -*- coding: utf-8 -*-
"""Back up a Hatena Blog: list its entries in an Excel sheet and save images.

The script walks the blog's archive pages, writes each entry's title and URL
into a workbook, and downloads every fotolife image found in each entry into
the ``picture`` directory.
"""
from bs4 import BeautifulSoup
import os
import requests
import re
import uuid
import openpyxl
import urllib.request
from openpyxl import Workbook

# Archive listing of the blog being backed up; "page=N" is appended per page.
ARCHIVE_URL = "https://c2c2c2c2.hatenablog.com/archive?"
# Only images whose src starts with this prefix are recorded and downloaded.
IMAGE_PREFIX = "https://cdn-ak.f.st-hatena.com/images/fotolife/c/c2c2c2c2"
# Compiled once; the prefix is escaped so "." only matches a literal dot.
IMAGE_SRC_PATTERN = re.compile("^" + re.escape(IMAGE_PREFIX))
# Directory that receives the downloaded image files.
PICTURE_DIR = "picture"
# Upper bound on the number of archive pages requested.
PAGE_MAX = 100
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
}


def _image_filename(src):
    """Return the local file name used for image URL *src*.

    The fotolife prefix is removed and every remaining "/" becomes "_".
    """
    return src.replace(IMAGE_PREFIX, "").replace("/", "_")


def _download_image(src):
    """Fetch *src* and store it in PICTURE_DIR; skip the write on an HTTP error."""
    response = requests.get(src)
    if not response.ok:
        # Do not save an error page under an image file name.
        print("download failed", response.status_code, src)
        return
    with open(os.path.join(PICTURE_DIR, _image_filename(src)), "wb") as file:
        file.write(response.content)


class rlist:
    """Scraper that exports the blog's entry list and images."""

    def get_xls(self, fname):
        """Scrape the archive pages and save the result as workbook *fname*.

        Sheet layout: row 1 holds the headers 'title name', 'url' and 'pics'.
        Each entry then occupies one row (title, URL); its image URLs are
        listed in column 3 starting on the entry's own row and continuing
        downwards, and the next entry starts below the last image row.
        Every listed image is also downloaded into PICTURE_DIR.

        Args:
            fname: Path of the .xlsx file to write.

        ``self`` is not used, so the method keeps working when it is called
        through the class with an arbitrary first argument.
        """
        wb = Workbook()
        ws = wb.active
        ws.cell(row=1, column=1).value = 'title name'
        ws.cell(row=1, column=2).value = 'url'
        ws.cell(row=1, column=3).value = 'pics'

        # The original assumed this directory already existed.
        os.makedirs(PICTURE_DIR, exist_ok=True)

        row = 2  # next free sheet row; row 1 is the header
        for page in range(1, PAGE_MAX + 1):
            page_url = ARCHIVE_URL + "page=" + str(page)
            print(page_url)
            listing = requests.get(url=page_url, headers=HEADERS)
            # requests reports ISO-8859-1 when the server sent no charset;
            # in that case let BeautifulSoup detect the encoding itself.
            encoding = listing.encoding if listing.encoding != 'ISO-8859-1' else None
            soup = BeautifulSoup(listing.content, "html.parser", from_encoding=encoding)

            # A page without entry links marks the end of the archive. (The
            # earlier `find_all(...) is None` test could never fire because
            # find_all returns a list, so this was always the real stop test.)
            links = soup.find_all(attrs={"class": "entry-title-link"})
            if not links:
                print("break")
                break

            for link in links:
                title = link.get_text()
                href = link.get("href")
                ws.cell(row=row, column=1).value = title
                ws.cell(row=row, column=2).value = href
                print(page, row, title, href)

                entry = requests.get(url=href, headers=HEADERS)
                entry_soup = BeautifulSoup(entry.text, 'lxml')
                imgs = entry_soup.find_all('img', src=IMAGE_SRC_PATTERN)
                if imgs:
                    print("img is not 0", len(imgs))
                for offset, img in enumerate(imgs):
                    src = img['src']
                    ws.cell(row=row + offset, column=3).value = src
                    print(page, row + offset, src)
                    _download_image(src)

                # An entry uses one row, or one row per image when it has several.
                row += max(len(imgs), 1)

        wb.save(filename=fname)


# Kept from the original script, which passed it in place of `self`;
# get_xls does not read it.
k = u'ごろねこサミット'
rlist().get_xls('sample_book2.xlsx')