pythonを試しました。その6

ハローブログの画像のバックアップを試しました。

こんな感じです。

[f:id:c2c2c2c2:20190302164555j:plain]

[f:id:c2c2c2c2:20190302164806j:plain]

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

import requests

import re

import uuid

import openpyxl

import urllib.request

from openpyxl import Workbook

class rlist:
    """Back up the entry list and photos of the c2c2c2c2 Hatena blog."""

    def get_xls(self, fname, page_max=100, picture_dir='picture'):
        """Crawl the blog archive, download its photos and save an index workbook.

        Archive pages are fetched one after another. For every entry link
        found, the title goes to column 1 and the entry URL to column 2. The
        entry page is then fetched, and each Fotolife image belonging to the
        blog owner is downloaded into ``picture_dir`` and its URL written to
        column 3. An entry with several images occupies one row per image,
        and the following entry starts on the next free row.

        Crawling stops at the first archive page that shows the
        "記事はありません" notice or contains no entry links, or after
        ``page_max`` pages.

        Args:
            self: Unused. The method never reads instance state, so it also
                works when called through the class with any placeholder.
            fname: Path of the .xlsx workbook to write.
            page_max: Upper bound on the number of archive pages to visit.
            picture_dir: Directory that receives the downloaded images; it is
                created when missing.

        Raises:
            requests.exceptions.RequestException: If an HTTP request fails or
                exceeds the timeout.
            OSError: If an image file or the directory cannot be written.
        """
        # Imported locally so the method does not depend on a file-level
        # "import os" being present.
        import os

        archive_url = "https://c2c2c2c2.hatenablog.com/archive?"
        photo_prefix = 'https://cdn-ak.f.st-hatena.com/images/fotolife/c/c2c2c2c2'
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
        }
        # Without a timeout a stalled connection would block the run forever.
        timeout = 30
        # Compiled once; the prefix is escaped so "." only matches a literal dot.
        photo_src = re.compile('^' + re.escape(photo_prefix))

        # open() does not create missing directories.
        os.makedirs(picture_dir, exist_ok=True)

        wb = Workbook()
        ws = wb.active
        ws.cell(row=1, column=1).value = 'title name'
        ws.cell(row=1, column=2).value = 'url'
        ws.cell(row=1, column=3).value = 'pics'

        # ``index`` is the row of the first entry of the current page plus the
        # extra rows consumed so far by multi-image entries of that page, so
        # entry ``i`` always lands on row ``i + index``.
        index = 2
        page = 1

        for j in range(page_max):
            urli = archive_url + 'page=' + str(j + 1)
            print(urli)

            p = requests.get(url=urli, headers=headers, timeout=timeout)
            # requests reports ISO-8859-1 when the server sent no charset; in
            # that case let BeautifulSoup detect the encoding itself.
            content_type_encoding = p.encoding if p.encoding != 'ISO-8859-1' else None
            bs = BeautifulSoup(p.content, "html.parser", from_encoding=content_type_encoding)

            # find_all() returns a (possibly empty) list, never None, so test
            # its truthiness: the notice being present means we ran past the
            # last archive page.
            if bs.find_all(string="記事はありません"):
                print("break")
                break

            tname = bs.find_all(attrs={"class": "entry-title-link"})
            if not tname:
                print("break")
                break

            for i, link in enumerate(tname):
                row = i + index
                title = link.get_text()
                href = link.get("href")
                ws.cell(row=row, column=1).value = title
                ws.cell(row=row, column=2).value = href
                print(page, row, i, index, 0, title, href)

                p1 = requests.get(url=href, headers=headers, timeout=timeout)
                bs1 = BeautifulSoup(p1.text, 'lxml')
                imgs = bs1.find_all('img', src=photo_src)
                if not imgs:
                    continue

                print("img is not 0", len(imgs))
                for pi, img in enumerate(imgs):
                    src = img['src']
                    ws.cell(row=row + pi, column=3).value = src
                    # Flatten the path below the user's Fotolife root into a
                    # single file name, e.g. "/2019/x.jpg" -> "_2019_x.jpg".
                    pfnameb = src.replace(photo_prefix, '').replace('/', '_')
                    print(page, row + pi, i, index, pi, src)

                    r = requests.get(src, timeout=timeout)
                    with open(os.path.join(picture_dir, pfnameb), 'wb') as file:
                        file.write(r.content)

                # The first image shares the entry's own row; every further
                # image pushed the following entries down by one row.
                index += len(imgs) - 1

            index += len(tname)
            page += 1

        wb.save(filename=fname)

# get_xls is declared with a `self` parameter but is invoked through the class
# itself, so this string merely fills that first positional slot.
k = u'ごろねこサミット'

# Run the backup and write the index workbook to sample_book2.xlsx
# (relative to the current working directory).
rlist.get_xls(k, fname='sample_book2.xlsx')