ハロー ブログ

日々のつぶやき @c2c2c2c221

Pythonを試しました。その6

ハローブログの画像のバックアップを試しました。

こんな感じです。

[f:id:c2c2c2c2:20190302164555j:plain]

[f:id:c2c2c2c2:20190302164806j:plain]

 

 

 

 

 

 

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
import uuid
import openpyxl
import urllib.request
from openpyxl import Workbook
 
class rlist:
    """Back up a blog's entry list and the images of each entry.

    Entry titles, entry links and image URLs are written to an Excel
    workbook; the image files themselves are saved under ``picture/``.
    """

    def get_xls(self, fname):
        """Crawl the paginated entry list and save the result to *fname*.

        For each listing page (1 to 100) every element with the class
        ``entry-title-link`` is recorded: its text goes to column 1 and its
        ``href`` to column 2.  The linked entry is then fetched and the
        ``src`` of each ``img`` tag is written to column 3, one row per
        image, while the image bytes are stored in ``picture/`` under the
        ``src`` value with ``/`` replaced by ``_``.

        Crawling stops at the first page that reports no articles or that
        contains no entry links.

        Args:
            fname: Path of the ``.xlsx`` file to write.

        Note:
            ``self`` is never read by this method.
        """
        import os  # local import: the file's import block does not provide it

        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
        }
        page_max = 100
        picture_dir = 'picture'
        # Make sure the download directory exists before the first write.
        os.makedirs(picture_dir, exist_ok=True)

        wb = Workbook()
        ws = wb.active
        ws.cell(row=1, column=1).value = 'title name'
        ws.cell(row=1, column=2).value = 'url'
        ws.cell(row=1, column=3).value = 'pics'

        # NOTE(review): url_1 is not defined in the visible source; it is
        # presumably a module-level base URL ending just before the "page="
        # query parameter -- confirm before running.
        url = url_1 + "page=1"

        # First worksheet row available for the entries of the current page.
        index = 2
        for page in range(1, page_max + 1):
            urli = url.replace('page=1', 'page=' + str(page))
            print(urli)
            p = requests.get(url=urli, headers=headers)
            # requests reports ISO-8859-1 when the server sent no charset;
            # in that case let BeautifulSoup detect the encoding itself.
            content_type_encoding = p.encoding if p.encoding != 'ISO-8859-1' else None
            bs = BeautifulSoup(p.content, "html.parser", from_encoding=content_type_encoding)

            # find_all() returns a (possibly empty) list, never None, so the
            # "no articles" page has to be detected by truthiness.
            if bs.find_all(string="記事はありません"):
                print("break")
                break

            tname = bs.find_all(attrs={"class": "entry-title-link"})
            if not tname:
                print("break")
                break

            for i, entry in enumerate(tname):
                row = i + index
                title = entry.get_text()
                link = entry.get("href")
                ws.cell(row=row, column=1).value = title
                ws.cell(row=row, column=2).value = link
                print(page, row, i, index, 0, title, link)

                p1 = requests.get(url=link, headers=headers)
                bs1 = BeautifulSoup(p1.text, 'lxml')
                imgs = bs1.find_all('img')
                if not imgs:
                    continue

                print("img is not 0", len(imgs))
                for pi, img in enumerate(imgs):
                    src = img['src']
                    ws.cell(row=row + pi, column=3).value = src
                    # Flatten the URL into a single file name.
                    pfnameb = src.replace('/', '_')
                    print(page, row + pi, i, index, pi, src)
                    r = requests.get(src)
                    with open(os.path.join(picture_dir, pfnameb), 'wb') as file:
                        file.write(r.content)

                # The images occupied len(imgs) rows starting at the entry's
                # own row; shift the base so the next entry lands below them.
                index += len(imgs) - 1

            index += len(tname)

        wb.save(filename=fname)

# Script entry: run the backup and write the workbook to sample_book2.xlsx.
# The label string is passed in the ``self`` position of the unbound method;
# get_xls does not read it.
k = 'ごろねこサミット'
rlist.get_xls(k, fname='sample_book2.xlsx')