Specialized Unit 2 / Yamauchi Lab Seminar (2023/11/22)

Related sites and materials

Let's fetch data from Aozora Bunko with web scraping

Related links

Program
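First, download the HTML of a single work's page from Aozora Bunko and parse it with BeautifulSoup.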

from bs4 import BeautifulSoup
from urllib import request

use_proxy = False  # set to True when going through the campus proxy

url = 'https://www.aozora.gr.jp/cards/000074/files/427_19793.html'
req = request.Request(url)
if use_proxy:
    # set_proxy() expects 'host:port' with no scheme prefix
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')

with request.urlopen(req) as res:
    soup = BeautifulSoup(res, 'html.parser')  # name a parser explicitly to avoid a GuessedAtParserWarning
    print(soup)
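
Printing soup shows the whole page, not just the story text, so the next steps narrow it down to the body and strip the markup.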
    

# on an Aozora Bunko page, the body text lives in <div class="main_text">
main_text = soup.find('div', class_='main_text')
print(main_text)
    

# <rp> and <rt> hold ruby (furigana) annotations; remove them from the tree
tags_to_delete = main_text.find_all(['rp', 'rt'])
for tag in tags_to_delete:
    tag.decompose()
print(main_text)
    

main_text = main_text.get_text()  # plain text only, all remaining tags discarded
print(main_text)
    

# strip carriage returns, line feeds, and full-width spaces (U+3000)
main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
main_text
    

import re

main_text = re.sub('([!。])', r'\1\n', main_text)  # insert a line break after every 。 and !
text_list = main_text.splitlines()
print(text_list)
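
So far each step was run by hand; the same pipeline can be wrapped into a reusable function.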
    

def get_data(url, use_proxy):
    req = request.Request(url)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')

    with request.urlopen(req) as res:
        soup = BeautifulSoup(res, 'html.parser')

    main_text = soup.find('div', class_='main_text')
    tags_to_delete = main_text.find_all(['rp', 'rt'])
    for tag in tags_to_delete:
        tag.decompose()
    main_text = main_text.get_text()
    main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
    main_text = re.sub('([!。])', r'\1\n', main_text)
    text_list = main_text.splitlines()

    return text_list
    

url = 'https://www.aozora.gr.jp/cards/000074/files/427_19793.html'
text_list = get_data(url, False)
print(text_list)
    

url = 'https://www.aozora.gr.jp/cards/000074/files/429_19794.html'
text_list = get_data(url, False)
print(text_list)
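
The output for this second work still contains chapter headings and dialogue in 「」 brackets, so get_data is revised below: <h4> headings are dropped and a closing 」 is treated as a line break.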
    

def get_data(url, use_proxy):
    req = request.Request(url)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')

    with request.urlopen(req) as res:
        soup = BeautifulSoup(res, 'html.parser')

    main_text = soup.find('div', class_='main_text')
    tags_to_delete = main_text.find_all(['rp', 'rt', 'h4'])  # also drop <h4> chapter headings
    for tag in tags_to_delete:
        tag.decompose()
    main_text = main_text.get_text()
    main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
    main_text = main_text.replace('「', '').replace('」', '\n')  # treat a closing quote as a line break
    main_text = re.sub('([!。])', r'\1\n', main_text)
    main_text = main_text.replace('\n\n', '\n')  # assign the result; str.replace() returns a new string
    text_list = main_text.splitlines()

    return text_list
    

url = 'https://www.aozora.gr.jp/cards/000074/files/429_19794.html'
text_list = get_data(url, False)
print(text_list)
    

url_list = ['https://www.aozora.gr.jp/cards/000074/files/427_19793.html',
            'https://www.aozora.gr.jp/cards/000074/files/429_19794.html']
text_list = []
  
for url in url_list:
    text_list.extend(get_data(url, False))
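
Hard-coding every URL does not scale. The list of an author's works can instead be scraped from the author's index page on Aozora Bunko.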
    

base_url = 'https://www.aozora.gr.jp/index_pages/'
author = 'person74.html'  # Motojiro Kajii

use_proxy = False  # set to True when going through the campus proxy

url = base_url + author
req = request.Request(url)
if use_proxy:
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')

with request.urlopen(req) as res:
    soup = BeautifulSoup(res, 'html.parser')
    print(soup)
    

# the author's works are listed in an <ol>; each link leads to a card page
url_list = [item['href'] for item in soup.find('ol').find_all('a')]
title_page_url = base_url + url_list[0]
print(title_page_url)
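
Each link points to a card page for one work; fetching that page yields the HTML containing the link to the full XHTML text.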
    

use_proxy = False  # set to True when going through the campus proxy

req = request.Request(title_page_url)
if use_proxy:
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')

with request.urlopen(req) as res:
    title_page = BeautifulSoup(res, 'html.parser')

print(title_page)
    

# the XHTML link sits in the second right-aligned <div> on the card page
read_xhtml_div = title_page.find_all('div', align='right')[1]
html_path = read_xhtml_div.find_all('a')[1]['href']
title_page_url = re.sub(r'card\d+\.html', '', title_page_url)  # keep the directory part of the card URL
print(title_page_url)
get_data(title_page_url + html_path, False)
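
Putting it all together: loop over every work on the author's index page, resolve each card page to its XHTML full text, and collect all sentences, pausing between requests.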
    

import time

def get_all_title_data(use_proxy):
    base_url = 'https://www.aozora.gr.jp/index_pages/'
    author = 'person74.html'
    req = request.Request(base_url + author)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
    with request.urlopen(req) as res:
        author_page = BeautifulSoup(res, 'html.parser')

    text_list = []
    url_list = [item['href'] for item in author_page.find('ol').find_all('a')]
    for url in url_list:
        title_page_url = base_url + url
        req = request.Request(title_page_url)
        if use_proxy:
            req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
            req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
        with request.urlopen(req) as res:
            title_page = BeautifulSoup(res, 'html.parser')

        read_xhtml_div = title_page.find_all('div', align='right')[1]
        html_path = read_xhtml_div.find_all('a')[1]['href']
        title_page_url = re.sub(r'card\d+\.html', '', title_page_url)
        result = get_data(title_page_url + html_path, use_proxy)
        text_list.extend(result)
        time.sleep(5)  # be polite: pause between requests to avoid hammering the server

    return text_list
    

text_list = get_all_title_data(False)
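
As a quick check, the collected sentences can be counted and saved to disk; a minimal sketch (the file name kajii_sentences.txt is an arbitrary choice, not from the seminar):

print(len(text_list))  # number of extracted sentences

with open('kajii_sentences.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(text_list))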