ハロー ブログ

日々のつぶやき @c2c2c2c221

Pythonを試しました。その7

ハローブログの画像のバックアップを試しました。

こんな感じです。
f:id:c2c2c2c2:20190302164555j:plain
f:id:c2c2c2c2:20190302164806j:plain

ソースコードはこんな感じです。

# -*- coding: utf-8 -*-
import os
import re
import urllib.request
import uuid

import openpyxl
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

class rlist:
    """Scrape a Hatena blog's archive pages into an Excel workbook.

    For each article found on the paginated archive, the title and URL are
    written to columns 1-2 of the active sheet; every fotolife image inside
    the article is downloaded into ./picture/ and its src URL recorded in
    column 3 (one worksheet row per image).
    """

    def get_xls(self, fname):
        """Crawl the archive pages and save the results to *fname* (.xlsx).

        Crawling stops when an archive page contains the
        "記事はありません" (no articles) marker, when a page yields no
        entry-title links, or after 100 pages.
        """
        base_url = "https://c2c2c2c2.hatenablog.com/archive?page="
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
        }
        # Common prefix of this blog's hosted images; used both to filter
        # <img> tags and to derive flat local filenames.
        img_prefix = 'https://cdn-ak.f.st-hatena.com/images/fotolife/c/c2c2c2c2'
        # Hoisted out of the per-article loop (was recompiled every article).
        img_re = re.compile('^' + img_prefix)

        wb = Workbook()
        ws = wb.active
        ws.cell(row=1, column=1).value = 'title name'
        ws.cell(row=1, column=2).value = 'url'
        ws.cell(row=1, column=3).value = 'pics'

        # Bug fix: the original assumed ./picture already existed and
        # crashed with FileNotFoundError on a fresh checkout.
        os.makedirs('picture', exist_ok=True)

        index = 2       # next free worksheet row
        page = 1        # 1-based archive page counter (for logging)
        pi = 0          # extra rows consumed by images within one article
        page_max = 100  # hard upper bound on pages crawled
        for j in range(page_max):
            urli = base_url + str(j + 1)
            print(urli)
            p = requests.get(url=urli, headers=headers)
            # requests falls back to ISO-8859-1 when the server sends no
            # charset; in that case let BeautifulSoup sniff the encoding.
            content_type_encoding = p.encoding if p.encoding != 'ISO-8859-1' else None
            bs = BeautifulSoup(p.content, "html.parser", from_encoding=content_type_encoding)
            # Bug fix: find_all() never returns None, so the original
            # `is None` test was dead code.  A non-empty result means the
            # "no articles" page was reached -> stop crawling.
            if bs.find_all(string="記事はありません"):
                print("break")
                break
            tname = bs.find_all(attrs={"class": "entry-title-link"})
            l = len(tname)
            # Bug fix: `l is 0` was an identity test relying on small-int
            # interning; compare by value.
            if l == 0:
                print("break")
                break
            for i in range(l):
                cell = ws.cell(row=i + index, column=1)
                cellp = ws.cell(row=i + index, column=2)
                cell.value = tname[i].get_text()
                cellp.value = tname[i].get("href")
                print(page, i + index, i, index, pi, cell.value, cellp.value)
                p1 = requests.get(url=tname[i].get("href"), headers=headers)
                bs1 = BeautifulSoup(p1.text, 'lxml')
                imgs = bs1.find_all('img', src=img_re)
                # Bug fix: `len(imgs) is not 0` was an identity test; use
                # plain truthiness instead.
                if imgs:
                    print("img is not 0", len(imgs))
                    for img in imgs:
                        cellpi = ws.cell(row=i + index + pi, column=3)
                        cellpi.value = img['src']
                        # Flatten the remote path into a single filename.
                        pfnameb = img['src'].replace(img_prefix, '').replace('/', '_')
                        print(page, i + index + pi, i, index, pi, img['src'])
                        r = requests.get(img['src'])
                        with open('picture/' + pfnameb, 'wb') as file:
                            file.write(r.content)
                        pi = pi + 1
                    # Advance past the extra image rows (minus the article's
                    # own row, which the outer `index + l` already counts).
                    index = index + pi - 1
                pi = 0
            index = index + l
            page = page + 1
            pi = 0
        wb.save(filename=fname)

if __name__ == "__main__":
    # Bug fix: the original called the *unbound* method as
    # rlist.get_xls(k, 'sample_book2.xlsx'), passing an unrelated string
    # in as `self`.  Instantiate the class and pass only the filename.
    rlist().get_xls('sample_book2.xlsx')