Iwara爬虫，科学GHS

又闲得慌，写个Iwara爬虫，欢迎大佬指点

复制代码

import requests
from lxml import etree
import os
import re
from subprocess import call
#################################
# Function definitions
# Create a folder
def mkdir(path):
    """Create the directory ``path`` (with missing parents) if it is absent.

    An existing directory is left untouched. If ``path`` exists but is a
    regular file, nothing is created and no error is raised.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. for an empty path.
    """
    try:
        # exist_ok removes the race between "does it exist?" and "create it".
        os.makedirs(path, exist_ok=True)
    except FileExistsError:
        # The path exists but is not a directory; leave it alone, as the
        # previous exists-check did.
        pass
# Download a resource with IDM
def idm_dld(idm_url, idm_path, idm_name, idm_dir="C:\\Green\\IDM 6.36.5"):
    """Hand one download to Internet Download Manager and start its queue.

    Args:
        idm_url: URL of the file to download.
        idm_path: Local folder the file should be saved into.
        idm_name: File name to save the download as.
        idm_dir: Folder that contains ``IDMan.exe``.

    Raises:
        OSError: If ``IDMan.exe`` cannot be launched from ``idm_dir``.
    """
    idm_exe = os.path.join(idm_dir, "IDMan.exe")
    # IDM resolves paths on its own, so give it an absolute folder; the
    # script's own working directory is deliberately left unchanged so that
    # relative download paths keep pointing at the same place between calls.
    target_dir = os.path.abspath(idm_path)
    # Per IDM's command-line parameters: /d url, /p folder, /f file name,
    # /a adds the item to the queue without starting it.
    call([idm_exe, '/d', idm_url, '/p', target_dir, '/f', idm_name, '/a'],
         cwd=idm_dir)
    # /s starts the queue.
    call([idm_exe, '/s'], cwd=idm_dir)
# Extract the video source
def get_source_url(url, timeout=30):
    """Return the download URL of the 'Source' resolution for a video page.

    The video API address is derived from ``url`` by replacing its first
    ``videos`` with ``api/video``; the JSON reply is searched for the entry
    whose ``resolution`` is ``'Source'``.

    Args:
        url: Address of a video page (must contain ``videos``).
        timeout: Seconds to wait for the API before giving up.

    Returns:
        ``'https:' + uri`` of the source entry, or ``None`` when ``url`` is
        not a video page, the reply is not a JSON list, or no source entry
        is present.

    Raises:
        requests.exceptions.RequestException: On network errors or timeout.
    """
    if 'videos' not in url:
        return None
    url_api = url.replace('videos', 'api/video', 1)
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url_api, timeout=timeout)
    try:
        js = response.json()
    except ValueError:
        # Reply was not JSON (e.g. an HTML error page).
        return None
    if not isinstance(js, list):
        return None
    for item in js:
        if not isinstance(item, dict):
            continue
        if item.get('resolution') == 'Source' and item.get('uri'):
            # The uri is prefixed with the scheme; presumably the API
            # returns it protocol-relative ("//host/...") - TODO confirm.
            return 'https:' + item['uri']
    return None
# Extract video page addresses from a listing page
def get_urls(search_url, timeout=30):
    """Collect the video page links found on a search/listing page.

    Args:
        search_url: Address of the listing page; empty means "nothing to do".
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A list of absolute ``https://ecchi.iwara.tv`` URLs, one per link
        found inside ``div.field-item.even`` elements; ``[]`` when
        ``search_url`` is empty.

    Raises:
        requests.exceptions.RequestException: On network errors or timeout.
    """
    if not search_url:
        return []
    # Fetch the page once (it used to be requested twice, the first
    # response being thrown away).
    response = requests.get(search_url, timeout=timeout)
    tree = etree.HTML(response.text)
    if tree is None:
        # Guard against a reply that produced no document root.
        return []
    hrefs = tree.xpath('//div[@class="field-item even"]/a/@href')
    return ['https://ecchi.iwara.tv' + href for href in hrefs]
# Download the videos of several pages
# Videos older than expire_year-expire_month are not downloaded
def videos_dld(web_urls, dld_path, expire_year, expire_month):
    """Send the source file of every video page in ``web_urls`` to IDM.

    Each file is stored in a sub-folder of ``dld_path`` named after the
    uploader. The file name is the ``file=`` parameter of the source URL with
    ``%2F`` replaced by ``-``; its first two ``-`` separated fields are read
    as year and month, and everything after the third ``-`` is the legacy
    (unprefixed) file name. A legacy file that is already on disk is renamed
    to the new name instead of being downloaded again.

    Processing stops at the first video dated before the cut-off.
    NOTE(review): this presumably relies on listing pages being sorted
    newest first - confirm.

    Args:
        web_urls: Page addresses; entries without ``videos`` are skipped.
        dld_path: Root download folder.
        expire_year: Cut-off year.
        expire_month: Cut-off month; videos from this month on are kept.
    """
    total = len(web_urls)
    for position, page_url in enumerate(web_urls, start=1):
        print(str(position) + '/' + str(total) + '\t' + page_url)
        if 'videos' not in page_url:
            continue
        source_url = get_source_url(page_url)
        if source_url is None:
            continue
        name_match = re.search('file=(.*?)&op', source_url)
        if name_match is None:
            print('\tState: no file name in source url, skipped')
            continue
        page = etree.HTML(requests.get(page_url, timeout=30).text)
        authors = page.xpath('//a[@class="username"]/text()') if page is not None else []
        if not authors:
            print('\tState: author not found, skipped')
            continue
        author = authors[0]
        print('\t' + 'Author: ' + author)
        print('\t' + source_url)
        file_name = name_match.group(1).replace('%2F', '-')
        print('\t' + 'New name: ' + file_name)
        # Paths are assembled with a literal backslash: the tool is
        # Windows-only and IDM receives exactly these strings.
        file_path = dld_path + '\\' + author
        mkdir(file_path)
        local_name = file_path + '\\' + file_name
        # maxsplit keeps hyphens that belong to the legacy file name itself.
        fields = file_name.split('-', 3)
        try:
            file_date = (int(fields[0]), int(fields[1]))
            old_file_name = fields[3]
        except (ValueError, IndexError):
            print('\tState: unexpected file name format, skipped')
            continue
        print('\t' + 'Old name: ' + old_file_name)
        old_local_name = file_path + '\\' + old_file_name
        # Check the date
        if file_date < (expire_year, expire_month):
            print('\tEarly date')
            break
        has_new = os.path.isfile(local_name)
        has_old = os.path.isfile(old_local_name)
        if not has_new and not has_old:
            print('\tState: no such file, to be downloaded')
            idm_dld(source_url, file_path, file_name)
        elif has_old and has_new:
            print('\tState: old and new file exist')
            os.remove(old_local_name)
        elif has_old:
            print('\tState: old file exist')
            os.rename(old_local_name, local_name)
        else:
            print('\tState: new file exist')
# Combined entry point
def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
    """Download from a search page and from explicitly given video links.

    Args:
        dld_path: Root download folder.
        search_url: Listing page to scan for videos ('' to skip).
        web_urls: Video page addresses to download directly.
        expire_year: Cut-off year applied to search-page videos.
        expire_month: Cut-off month applied to search-page videos.
    """
    # Videos found on the search page honour the caller's cut-off date.
    videos_dld(get_urls(search_url), dld_path, expire_year, expire_month)
    # Directly given links are checked against a fixed 2000-0 cut-off.
    videos_dld(web_urls, dld_path, 2000, 0)
##################################
# Main program
##################################
def main():
    """Run the interactive console menu of the downloader.

    Menu entries: 1 downloads a single video link, 2 downloads every video
    of a search page, 3 changes the settings (download folder and cut-off
    year/month), 4 prints the settings, 5 quits. Any other input simply
    shows the menu again. End of input or Ctrl+C at the menu prompt quits.
    """
    # Default settings
    default_path = "C:\\Another\\temp\\iwara"
    default_year = 2020
    default_month = 3
    local_path = default_path
    # Only videos from this date on are downloaded
    year = default_year
    month = default_month
    print('------------------------------------')
    print('Iwara Downloader V1.0')
    print('------------------------------------')
    print('Default Settings:')
    print('Download Directory: %s' % (local_path))
    print('Year: %s' % str(year))
    print('Month: %s' % str(month))
    while True:
        print('------------------------------------\n------------------------------------\nAvailable Functions：')
        print('1.Download Video Directly From Link')
        print('2.Download All Videos In The Searching Page')
        print('3.Change Download Settings')
        print('4.View Download Settings')
        print('5.Quit')
        try:
            cmd = int(input('\nChoose Desired Function：'))
        except ValueError:
            # Not a number: fall through and show the menu again.
            cmd = 0
        except (EOFError, KeyboardInterrupt):
            # A closed stdin or Ctrl+C must end the program instead of
            # being swallowed and looping forever.
            print('\nQuit')
            break
        if cmd == 1:
            link = input('\nDownload Video Directly From Link (Enter To Skip):')
            iwara_dld(local_path, '', [link], year, month)
        elif cmd == 2:
            search_page = input('\nDownload All Videos In The Searching Page (Enter To Skip):')
            iwara_dld(local_path, search_page, [], year, month)
        elif cmd == 3:
            try:
                local_path = input('\nSet Download Directory:')
                mkdir(local_path)
                year = int(input('\nSet Year'))
                month = int(input('\nSet Month'))
            except (ValueError, OSError):
                # Unusable folder or non-numeric date: restore every default.
                local_path = default_path
                year = default_year
                month = default_month
                print('Invalid settings, defaults restored')
        elif cmd == 4:
            print('\nView Download Settings :')
            print('Download Directory: %s' % (local_path))
            print('Year: %s' % str(year))
            print('Month: %s' % str(month))
        elif cmd == 5:
            print('\nQuit')
            break
        # NOTE(review): printed after every handled menu round; assumed to
        # belong inside the loop - confirm against the original layout.
        print('Finished\n')
# Start the interactive menu only when this file is executed as a script.
if "__main__" == __name__:
    main()

为加快下载速度，脚本调用 IDM（Internet Download Manager）来下载视频。需要把 IDM 本体安装到“C:\Green\IDM 6.36.5”文件夹中（必须与代码中 idm_dld 函数使用的路径一致），并且在运行爬虫之前先启动 IDM。

度盘IDM链接：
链接: https://pan.baidu.com/s/1ZsjNK8kgrLHxb7AlNZ8gEw
提取码: 9msh
解压后运行“!绿化.bat”就行了

打包的.exe文件：
链接: https://pan.baidu.com/s/1FAoyJEOZHW_rl-xo9BtAFA
提取码: qkdm

使用截图：