Specialized Unit 2 / Yamauchi Lab Seminar (2023/11/22)
Related sites and materials
Let's retrieve data from Aozora Bunko by scraping
Related links
Program
from bs4 import BeautifulSoup
from urllib import request
use_proxy = False  # set to True when using a proxy

url = 'https://www.aozora.gr.jp/cards/000074/files/427_19793.html'
req = request.Request(url)
if use_proxy:
    # campus proxy; set_proxy() takes host:port without a scheme
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
with request.urlopen(req) as res:
    soup = BeautifulSoup(res, 'html.parser')
print(soup)
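As a side note, the same proxy routing can be set up once with a ProxyHandler-based opener instead of calling set_proxy on every request. A minimal sketch, assuming the same campus proxy host as above:

from urllib import request

proxies = {'http': 'http://ccproxyz.kanagawa-it.ac.jp:10080',
           'https': 'http://ccproxyz.kanagawa-it.ac.jp:10080'}
opener = request.build_opener(request.ProxyHandler(proxies))
with opener.open('https://www.aozora.gr.jp/cards/000074/files/427_19793.html') as res:
    print(res.status)  # 200 if the fetch through the proxy succeeded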
# the body of the work sits inside <div class="main_text">
main_text = soup.find('div', class_='main_text')
print(main_text)
# remove the ruby (furigana) annotation tags
tags_to_delete = main_text.find_all(['rp', 'rt'])
for tag in tags_to_delete:
    tag.decompose()
print(main_text)
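To see why <rp> and <rt> have to go: Aozora Bunko marks furigana with ruby markup, and get_text() would otherwise merge the reading into the text. A tiny illustration on a made-up snippet of HTML:

from bs4 import BeautifulSoup

ruby_html = '<ruby>檸檬<rp>(</rp><rt>れもん</rt><rp>)</rp></ruby>'
sample = BeautifulSoup(ruby_html, 'html.parser')
print(sample.get_text())  # 檸檬(れもん)  -- the reading leaks into the text
for tag in sample.find_all(['rp', 'rt']):
    tag.decompose()
print(sample.get_text())  # 檸檬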
main_text = main_text.get_text()
print(main_text)
# strip carriage returns, newlines, and ideographic spaces (U+3000)
main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
main_text
import re
main_text = re.sub(r'([!。])', r'\1\n', main_text)  # break into lines after 。 and !
text_list = main_text.splitlines()
print(text_list)
def get_data(url, use_proxy):
    req = request.Request(url)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
    with request.urlopen(req) as res:
        soup = BeautifulSoup(res, 'html.parser')
    main_text = soup.find('div', class_='main_text')
    tags_to_delete = main_text.find_all(['rp', 'rt'])
    for tag in tags_to_delete:
        tag.decompose()
    main_text = main_text.get_text()
    main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
    main_text = re.sub(r'([!。])', r'\1\n', main_text)
    text_list = main_text.splitlines()
    return text_list
url = 'https://www.aozora.gr.jp/cards/000074/files/427_19793.html'
text_list = get_data(url, False)
print(text_list)
url = 'https://www.aozora.gr.jp/cards/000074/files/429_19794.html'
text_list = get_data(url, False)
print(text_list)
# version 2: also strip <h4> headings and 「」 quotation brackets
def get_data(url, use_proxy):
    req = request.Request(url)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
    with request.urlopen(req) as res:
        soup = BeautifulSoup(res, 'html.parser')
    main_text = soup.find('div', class_='main_text')
    tags_to_delete = main_text.find_all(['rp', 'rt', 'h4'])
    for tag in tags_to_delete:
        tag.decompose()
    main_text = main_text.get_text()
    main_text = main_text.replace('\r', '').replace('\n', '').replace('\u3000', '')
    main_text = main_text.replace('「', '').replace('」', '\n')
    main_text = re.sub(r'([!。])', r'\1\n', main_text)
    # assign the result back (strings are immutable, so a bare replace() call is discarded)
    main_text = main_text.replace('\n\n', '\n')
    text_list = main_text.splitlines()
    return text_list
url = 'https://www.aozora.gr.jp/cards/000074/files/429_19794.html'
text_list = get_data(url, False)
print(text_list)
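Even with the replace('\n\n', '\n') step, three or more consecutive newlines would still leave empty strings in the result. A simpler hedged alternative is to drop empty lines while splitting:

text_list = [line for line in main_text.splitlines() if line != '']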
url_list = ['https://www.aozora.gr.jp/cards/000074/files/427_19793.html',
            'https://www.aozora.gr.jp/cards/000074/files/429_19794.html']
text_list = []
for url in url_list:
    text_list.extend(get_data(url, False))
base_url = 'https://www.aozora.gr.jp/index_pages/'
author = 'person74.html'  # 梶井基次郎 (Kajii Motojirō)
use_proxy = False  # set to True when using a proxy

url = base_url + author
req = request.Request(url)
if use_proxy:
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
with request.urlopen(req) as res:
    soup = BeautifulSoup(res, 'html.parser')
print(soup)
# collect the links to each work's card page from the author's work list
url_list = [item['href'] for item in soup.find('ol').find_all('a')]
title_page_url = base_url + url_list[0]
print(title_page_url)
use_proxy = False  # set to True when using a proxy
req = request.Request(title_page_url)
if use_proxy:
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
    req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
with request.urlopen(req) as res:
    title_page = BeautifulSoup(res, 'html.parser')
print(title_page)
# pick out the link to the XHTML version of the text from the card page
read_xhtml_div = title_page.find_all('div', align='right')[1]
html_path = read_xhtml_div.find_all('a')[1]['href']
# drop the trailing cardNNN.html so the relative html_path can be appended
title_page_url = re.sub(r'card\d+\.html', '', title_page_url)
print(title_page_url)
get_data(title_page_url + html_path, False)
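To make the re.sub step concrete: card page URLs have the form .../cards/000074/cardNNN.html, and stripping the trailing cardNNN.html leaves the directory against which the relative html_path is joined. For example (card427.html is the card page corresponding to the file fetched at the start):

import re

example = 'https://www.aozora.gr.jp/cards/000074/card427.html'
print(re.sub(r'card\d+\.html', '', example))
# -> https://www.aozora.gr.jp/cards/000074/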
import time

def get_all_title_data(use_proxy):
    base_url = 'https://www.aozora.gr.jp/index_pages/'
    author = 'person74.html'
    req = request.Request(base_url + author)
    if use_proxy:
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
        req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
    with request.urlopen(req) as res:
        author_page = BeautifulSoup(res, 'html.parser')
    text_list = []
    url_list = [item['href'] for item in author_page.find('ol').find_all('a')]
    for url in url_list:
        title_page_url = base_url + url
        req = request.Request(title_page_url)
        if use_proxy:
            req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'http')
            req.set_proxy('ccproxyz.kanagawa-it.ac.jp:10080', 'https')
        with request.urlopen(req) as res:
            title_page = BeautifulSoup(res, 'html.parser')
        read_xhtml_div = title_page.find_all('div', align='right')[1]
        html_path = read_xhtml_div.find_all('a')[1]['href']
        title_page_url = re.sub(r'card\d+\.html', '', title_page_url)
        result = get_data(title_page_url + html_path, use_proxy)
        text_list.extend(result)
        time.sleep(5)  # pause between requests to avoid hammering the server
    return text_list

text_list = get_all_title_data(False)
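At this point text_list only lives in memory; to reuse the corpus in a later session, a minimal sketch (the file name kajii.txt is an arbitrary choice) writes one sentence per line:

with open('kajii.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(text_list))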