none.gif

AsCenSion

GF  2021-04-16 18:05

Iwara爬虫,科学GHS

又闲得慌,写个Iwara爬虫,欢迎大佬指点

复制代码
  1. import requests
  2. from lxml import etree
  3. import os
  4. import re
  5. from subprocess import call
  6. #################################
  7. # 函数定义
  # Create a directory (including missing parents).
  def mkdir(path):
      """Make sure the directory ``path`` exists.

      Missing parent directories are created as well. Calling this for a
      directory that already exists is a no-op.

      Args:
          path: Directory path to create.

      Raises:
          OSError: If the directory cannot be created, e.g. ``path`` is empty
              or a regular file already occupies ``path``.
      """
      # exist_ok avoids the check-then-create race of a separate
      # os.path.exists() test followed by os.makedirs().
      os.makedirs(path, exist_ok=True)
  # Hand a download over to Internet Download Manager (IDM).
  def idm_dld(idm_url, idm_path, idm_name, idm_dir="C:\\Green\\IDM 6.36.5"):
      """Add ``idm_url`` to the IDM queue and start the queue.

      Args:
          idm_url: URL of the file to download.
          idm_path: Local directory handed to IDM with ``/p``.
          idm_name: Local file name handed to IDM with ``/f``.
          idm_dir: Directory that contains ``IDMan.exe``.

      Only the IDM child process runs with ``idm_dir`` as working directory;
      the working directory of this Python process is left untouched.
      """
      idm_exe = os.path.join(idm_dir, "IDMan.exe")
      # Per IDM's command-line help: /d url, /p target directory,
      # /f target file name, /a add to the queue without starting it.
      call([idm_exe, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'], cwd=idm_dir)
      # /s starts the download queue.
      call([idm_exe, '/s'], cwd=idm_dir)
  # Resolve the "Source" quality file URL of a video page.
  def get_source_url(url):
      """Return the direct URL of the ``Source`` rendition of a video page.

      The API URL is built by replacing the first ``videos`` in ``url`` with
      ``api/video``. The response is read as a JSON list whose items carry a
      ``resolution`` and a scheme-less ``uri``.

      Args:
          url: Video page URL containing ``videos``.

      Returns:
          ``'https:' + uri`` of the first item whose resolution is
          ``'Source'``, or ``None`` when there is no such item.

      Raises:
          IndexError: If ``url`` does not contain ``videos``.
      """
      pieces = url.split('videos', 1)
      url_api = pieces[0] + 'api/video' + pieces[1]
      # Without a timeout a stalled connection would block forever.
      response = requests.get(url_api, timeout=30)
      for item in response.json():
          if item['resolution'] == 'Source':
              return 'https:' + item['uri']
      return None
  # Collect the video page URLs listed on a search page.
  def get_urls(search_url):
      """Return the absolute URLs of all videos linked from ``search_url``.

      Args:
          search_url: URL of a search/listing page; empty or ``None`` means
              "nothing to search".

      Returns:
          A list of ``https://ecchi.iwara.tv`` URLs, one per link found under
          a ``div`` with class ``field-item even``; empty when ``search_url``
          is empty.
      """
      if not search_url:
          return []
      # Fetch the page once (it used to be requested twice, with the first
      # response thrown away) and never wait forever on a stalled connection.
      page = requests.get(search_url, timeout=30)
      hrefs = etree.HTML(page.text).xpath('//div[@class="field-item even"]/a/@href')
      return ['https://ecchi.iwara.tv' + href for href in hrefs]
  # Download the videos behind several page URLs.
  # Videos uploaded before expire_year-expire_month are not downloaded.
  def videos_dld(web_urls, dld_path, expire_year, expire_month):
      """Queue the videos of ``web_urls`` in IDM, one folder per author.

      Args:
          web_urls: Video page URLs; entries not containing ``videos`` are
              only listed, not processed.
          dld_path: Root download directory; a sub-directory named after the
              author is created below it.
          expire_year: Cut-off year.
          expire_month: Cut-off month within ``expire_year``.

      The target file name is the ``file=`` parameter of the source URL with
      ``%2F`` replaced by ``-``; its first two dash-separated fields are read
      as year and month. A file that already exists under the undated name
      (everything after the third dash) is renamed to the dated name instead
      of being downloaded again. Processing stops at the first video older
      than the cut-off.
      """
      cutoff = (int(expire_year), int(expire_month))
      total = len(web_urls)
      for position, page_url in enumerate(web_urls, start=1):
          print(str(position) + '/' + str(total) + '\t' + page_url)
          if 'videos' not in page_url:
              continue
          source_url = get_source_url(page_url)
          if source_url is None:
              continue
          name_match = re.search('file=(.*?)&op', source_url)
          if name_match is None:
              print('\tState: no file name in source url, skipped')
              continue
          page = requests.get(page_url, timeout=30)
          author_names = etree.HTML(page.text).xpath('//a[@class="username"]/text()')
          if not author_names:
              print('\tState: author not found, skipped')
              continue
          author = author_names[0]
          print('\t' + 'Author: ' + author)
          print('\t' + source_url)
          file_name = name_match.group(1).replace('%2F', '-')
          print('\t' + 'New name: ' + file_name)
          dated = _split_dated_name(file_name)
          if dated is None:
              print('\tState: unexpected file name, skipped')
              continue
          year, month, old_file_name = dated
          file_path = os.path.join(dld_path, author)
          mkdir(file_path)
          local_name = os.path.join(file_path, file_name)
          print('\t' + 'Old name: ' + old_file_name)
          old_local_name = os.path.join(file_path, old_file_name)
          # Date check: (year, month) tuples compare year first, then month.
          if (year, month) < cutoff:
              print('\tEarly date')
              # NOTE(review): presumably the list is ordered newest first, so
              # everything after this entry is older as well - confirm.
              break
          has_new = os.path.isfile(local_name)
          has_old = os.path.isfile(old_local_name)
          if not has_new and not has_old:
              print('\tState: no such file, to be downloaded')
              idm_dld(source_url, file_path, file_name)
          elif has_old and has_new:
              print('\tState: old and new file exist')
              os.remove(old_local_name)
          elif has_old:
              print('\tState: old file exist')
              os.rename(old_local_name, local_name)
          else:
              print('\tState: new file exist')


  def _split_dated_name(file_name):
      """Split ``YYYY-MM-DD-<name>`` into ``(year, month, name)`` or return ``None``."""
      # maxsplit=3 keeps dashes that belong to the original file name.
      fields = file_name.split('-', 3)
      if len(fields) < 4:
          return None
      try:
          return int(fields[0]), int(fields[1]), fields[3]
      except ValueError:
          return None
  # Combined entry point: search page first, then explicitly given links.
  def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
      """Download the videos of a search page and of a list of video pages.

      Args:
          dld_path: Root download directory.
          search_url: Search page whose listed videos are downloaded; passed
              to ``get_urls``.
          web_urls: Video page URLs that are downloaded directly.
          expire_year: Cut-off year applied to the search page results.
          expire_month: Cut-off month applied to the search page results.
      """
      # Videos found on the search page honour the caller's cut-off date.
      listed_pages = get_urls(search_url)
      videos_dld(listed_pages, dld_path, expire_year, expire_month)
      # Explicit links are processed with a fixed cut-off of year 2000, month 0.
      videos_dld(web_urls, dld_path, 2000, 0)
  86. ##################################
  87. # 主程序
  88. ##################################
  def main():
      """Run the interactive command-line menu of the downloader.

      Menu entries: 1 download a single video page, 2 download every video
      of a search page, 3 change the settings, 4 show the settings, 5 quit.
      Anything that is not a number just redraws the menu; end of input on
      the menu prompt quits instead of looping forever.
      """
      # Default settings; also restored when option 3 gets invalid input.
      default_path = "C:\\Another\\temp\\iwara"
      default_year = 2020
      default_month = 3

      local_path = default_path
      # Only videos uploaded in or after year-month are downloaded.
      year = default_year
      month = default_month

      print('------------------------------------')
      print('Iwara Downloader V1.0')
      print('------------------------------------')
      print('Default Settings:')
      print('Download Directory: %s' % (local_path))
      print('Year: %s' % str(year))
      print('Month: %s' % str(month))
      while True:
          print('------------------------------------\n------------------------------------\nAvailable Functions:')
          print('1.Download Video Directly From Link')
          print('2.Download All Videos In The Searching Page')
          print('3.Change Download Settings')
          print('4.View Download Settings')
          print('5.Quit')
          try:
              cmd = int(input('\nChoose Desired Function:'))
          except ValueError:
              # Not a number: fall through with an unknown command.
              cmd = 0
          except EOFError:
              # stdin is closed; asking again could never succeed.
              print('\nQuit')
              break
          if cmd == 1:
              link = input('\nDownload Video Directly From Link (Enter To Skip):')
              iwara_dld(local_path, '', [link], year, month)
          elif cmd == 2:
              search_page = input('\nDownload All Videos In The Searching Page (Enter To Skip):')
              iwara_dld(local_path, search_page, [], year, month)
          elif cmd == 3:
              try:
                  new_path = input('\nSet Download Directory:')
                  mkdir(new_path)
                  new_year = int(input('\nSet Year'))
                  new_month = int(input('\nSet Month'))
              except (ValueError, OSError):
                  # Invalid number or directory: go back to the defaults.
                  print('Invalid settings, defaults restored')
                  local_path = default_path
                  year = default_year
                  month = default_month
              else:
                  local_path = new_path
                  year = new_year
                  month = new_month
          elif cmd == 4:
              print('\nView Download Settings :')
              print('Download Directory: %s' % (local_path))
              print('Year: %s' % str(year))
              print('Month: %s' % str(month))
          elif cmd == 5:
              print('\nQuit')
              break
          print('Finished\n')


  if __name__ == '__main__':
      main()


为加快下载速度调用了IDM,需要把IDM本体安装到“C:\Green\IDM 6.36.5”文件夹中,运行爬虫的时候需要保证IDM已经打开。

度盘IDM链接:
链接: https://pan.baidu.com/s/1ZsjNK8kgrLHxb7AlNZ8gEw
提取码: 9msh
解压后运行“!绿化.bat”就行了

打包的.exe文件:
链接: https://pan.baidu.com/s/1FAoyJEOZHW_rl-xo9BtAFA
提取码: qkdm

使用截图:





欢迎打赏
此帖售价 0 SP币,已有 67 人购买
若发现会员采用欺骗的方法获取财富,请立刻举报,我们会对会员处以2-N倍的罚金,严重者封掉ID!

none.gif

dokidokidoki

B1F  2021-04-17 19:58
哈哈,和我一样,不面向对象
最近的实验都是用python写的,python真jb好用
可以加个Qt5界面,命令行还是太low了,很简单的。

none.gif

dokidokidoki

B2F  2021-04-17 20:58
在楼主的基础上加了个qt5的界面
看了一下,requests没有设置user-agent,我这里是爬取不到东西的
楼主那里好用吗
爬虫的老问题了:SSL error,或者“远程主机强迫关闭了一个现有的连接。”之类的,懒得调试了。
没用过iwara,需要设置代理或者设置cookie吗







import requests
from lxml import etree
import os
import re
from subprocess import call
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QApplication, QMainWindow
import sys

#################################
# 函数定义
# Create a directory (including missing parents).
def mkdir(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. ``path`` is empty
            or a regular file already occupies ``path``.
    """
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(path, exist_ok=True)
# Hand a download over to Internet Download Manager (IDM).
def idm_dld(idm_url, idm_path, idm_name, idm_dir=r"C:\Users\zero\Desktop\IDM"):
    """Add ``idm_url`` to the IDM queue and start the queue.

    Args:
        idm_url: URL of the file to download.
        idm_path: Local directory handed to IDM with ``/p``.
        idm_name: Local file name handed to IDM with ``/f``.
        idm_dir: Directory that contains ``IDMan.exe``.

    Only the IDM child process runs with ``idm_dir`` as working directory;
    the working directory of this Python process is left untouched.
    """
    idm_exe = os.path.join(idm_dir, "IDMan.exe")
    # Per IDM's command-line help: /d url, /p target directory,
    # /f target file name, /a add to the queue without starting it.
    call([idm_exe, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'], cwd=idm_dir)
    # /s starts the download queue.
    call([idm_exe, '/s'], cwd=idm_dir)
# Resolve the "Source" quality file URL of a video page.
def get_source_url(url):
    """Return the direct URL of the ``Source`` rendition of a video page.

    The API URL is built by replacing the first ``videos`` in ``url`` with
    ``api/video``. The response is read as a JSON list whose items carry a
    ``resolution`` and a scheme-less ``uri``.

    Args:
        url: Video page URL containing ``videos``.

    Returns:
        ``'https:' + uri`` of the first item whose resolution is
        ``'Source'``, or ``None`` when there is no such item.

    Raises:
        IndexError: If ``url`` does not contain ``videos``.
    """
    pieces = url.split('videos', 1)
    url_api = pieces[0] + 'api/video' + pieces[1]
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url_api, timeout=30)
    for item in response.json():
        if item['resolution'] == 'Source':
            return 'https:' + item['uri']
    return None
# Collect the video page URLs listed on a search page.
def get_urls(search_url):
    """Return the absolute URLs of all videos linked from ``search_url``.

    Args:
        search_url: URL of a search/listing page; empty or ``None`` means
            "nothing to search".

    Returns:
        A list of ``https://ecchi.iwara.tv`` URLs, one per link found under
        a ``div`` with class ``field-item even``; empty when ``search_url``
        is empty.
    """
    if not search_url:
        return []
    # Fetch the page once and never wait forever on a stalled connection.
    page = requests.get(search_url, timeout=30)
    hrefs = etree.HTML(page.text).xpath('//div[@class="field-item even"]/a/@href')
    # Prefix every relative link; concatenating the prefix with the whole
    # list (instead of with each element) raised TypeError.
    return ['https://ecchi.iwara.tv' + href for href in hrefs]
# Download the videos behind several page URLs.
# Videos uploaded before expire_year-expire_month are not downloaded.
def videos_dld(web_urls, dld_path, expire_year, expire_month):
    """Queue the videos of ``web_urls`` in IDM, one folder per author.

    Args:
        web_urls: Video page URLs; entries not containing ``videos`` are
            only listed, not processed.
        dld_path: Root download directory; a sub-directory named after the
            author is created below it.
        expire_year: Cut-off year.
        expire_month: Cut-off month within ``expire_year``.

    The target file name is the ``file=`` parameter of the source URL with
    ``%2F`` replaced by ``-``; its first two dash-separated fields are read
    as year and month. A file that already exists under the undated name
    (everything after the third dash) is renamed to the dated name instead
    of being downloaded again. Processing stops at the first video older
    than the cut-off.
    """
    cutoff = (int(expire_year), int(expire_month))
    total = len(web_urls)
    # Work on each element; using the whole list where one URL is expected
    # fails on the very first string concatenation.
    for position, page_url in enumerate(web_urls, start=1):
        print(str(position) + '/' + str(total) + '\t' + page_url)
        if 'videos' not in page_url:
            continue
        source_url = get_source_url(page_url)
        if source_url is None:
            continue
        name_match = re.search('file=(.*?)&op', source_url)
        if name_match is None:
            print('\tState: no file name in source url, skipped')
            continue
        page = requests.get(page_url, timeout=30)
        author_names = etree.HTML(page.text).xpath('//a[@class="username"]/text()')
        if not author_names:
            print('\tState: author not found, skipped')
            continue
        author = author_names[0]
        print('\t' + 'Author: ' + author)
        print('\t' + source_url)
        file_name = name_match.group(1).replace('%2F', '-')
        print('\t' + 'New name: ' + file_name)
        dated = _split_dated_name(file_name)
        if dated is None:
            print('\tState: unexpected file name, skipped')
            continue
        year, month, old_file_name = dated
        file_path = os.path.join(dld_path, author)
        mkdir(file_path)
        local_name = os.path.join(file_path, file_name)
        print('\t' + 'Old name: ' + old_file_name)
        old_local_name = os.path.join(file_path, old_file_name)
        # Date check: (year, month) tuples compare year first, then month.
        if (year, month) < cutoff:
            print('\tEarly date')
            # NOTE(review): presumably the list is ordered newest first, so
            # everything after this entry is older as well - confirm.
            break
        has_new = os.path.isfile(local_name)
        has_old = os.path.isfile(old_local_name)
        if not has_new and not has_old:
            print('\tState: no such file, to be downloaded')
            idm_dld(source_url, file_path, file_name)
        elif has_old and has_new:
            print('\tState: old and new file exist')
            os.remove(old_local_name)
        elif has_old:
            print('\tState: old file exist')
            os.rename(old_local_name, local_name)
        else:
            print('\tState: new file exist')


def _split_dated_name(file_name):
    """Split ``YYYY-MM-DD-<name>`` into ``(year, month, name)`` or return ``None``."""
    # maxsplit=3 keeps dashes that belong to the original file name.
    fields = file_name.split('-', 3)
    if len(fields) < 4:
        return None
    try:
        return int(fields[0]), int(fields[1]), fields[3]
    except ValueError:
        return None
# Combined entry point: search page first, then explicitly given links.
def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
    """Download the videos of a search page and of a list of video pages.

    Args:
        dld_path: Root download directory.
        search_url: Search page whose listed videos are downloaded; passed
            to ``get_urls``.
        web_urls: Video page URLs that are downloaded directly.
        expire_year: Cut-off year applied to the search page results.
        expire_month: Cut-off month applied to the search page results.
    """
    # Videos found on the search page honour the caller's cut-off date.
    listed_pages = get_urls(search_url)
    videos_dld(listed_pages, dld_path, expire_year, expire_month)
    # Explicit links are processed with a fixed cut-off of year 2000, month 0.
    videos_dld(web_urls, dld_path, 2000, 0)


class Ui_MainWindow(object):
    """Widget layout of the main window (fixed geometry, no layout managers)."""

    def setupUi(self, MainWindow):
        """Create all child widgets on ``MainWindow`` and store them on ``self``.

        Every widget is stored under an attribute equal to its Qt object name.
        """
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(614, 531)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")

        # (attribute / object name, widget class, (x, y, width, height)),
        # listed in creation order.
        child_specs = (
            ("textBrowser", QtWidgets.QTextBrowser, (30, 321, 551, 191)),
            ("label", QtWidgets.QLabel, (30, 10, 71, 21)),
            ("lineEdit", QtWidgets.QLineEdit, (120, 10, 311, 21)),
            ("pushButton", QtWidgets.QPushButton, (460, 10, 111, 21)),
            ("label_2", QtWidgets.QLabel, (30, 50, 72, 15)),
            ("lineEdit_2", QtWidgets.QLineEdit, (120, 50, 311, 21)),
            ("pushButton_2", QtWidgets.QPushButton, (460, 50, 111, 21)),
            ("label_3", QtWidgets.QLabel, (30, 120, 72, 15)),
            ("lineEdit_3", QtWidgets.QLineEdit, (120, 120, 261, 21)),
            ("label_4", QtWidgets.QLabel, (30, 160, 72, 15)),
            ("lineEdit_4", QtWidgets.QLineEdit, (120, 160, 81, 21)),
            ("label_5", QtWidgets.QLabel, (30, 200, 72, 15)),
            ("lineEdit_5", QtWidgets.QLineEdit, (120, 200, 81, 21)),
            ("pushButton_3", QtWidgets.QPushButton, (240, 200, 111, 21)),
            ("pushButton_4", QtWidgets.QPushButton, (450, 200, 111, 21)),
        )
        for object_name, widget_class, rect in child_specs:
            child = widget_class(self.centralwidget)
            setattr(self, object_name, child)
            child.setGeometry(QtCore.QRect(*rect))
            child.setObjectName(object_name)

        MainWindow.setCentralWidget(self.centralwidget)

        menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar = menubar
        menubar.setGeometry(QtCore.QRect(0, 0, 614, 26))
        menubar.setObjectName("menubar")
        MainWindow.setMenuBar(menubar)

        statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar = statusbar
        statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the captions of all labels and buttons."""
        tr = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(tr("MainWindow", "MainWindow"))
        # (widget, caption) in the order the captions are applied.
        captions = (
            (self.label, "url"),
            (self.pushButton, "直链下载"),
            (self.label_2, "url"),
            (self.pushButton_2, "下载全部视频"),
            (self.label_3, "下载目录"),
            (self.label_4, "年份"),
            (self.label_5, "月份"),
            (self.pushButton_3, "更改下载设置"),
            (self.pushButton_4, "查看下载设置"),
        )
        for widget, caption in captions:
            widget.setText(tr("MainWindow", caption))

class MyWindow(QMainWindow, Ui_MainWindow):
    """Main window that connects the four buttons to the downloader functions.

    The class attributes below are the default settings; ``cmd3`` overrides
    them per instance.
    """
    # Settings
    local_path = "C:\\Another\\temp\\iwara"
    # Search page URL
    url = ''
    # Video pages
    urls = []
    # Only videos uploaded in or after year-month are downloaded
    year = 2020
    month = 3
    # NOTE: these run once, when the class body is executed.
    print('------------------------------------')
    print('Iwara Downloader V1.0')
    print('------------------------------------')
    print('Default Settings:')
    print('Download Directory: %s' % (local_path))
    print('Year: %s' % str(year))
    print('Month: %s' % str(month))

    def __init__(self, parent=None):
        """Build the UI and connect each button to its handler."""
        super().__init__(parent)
        self.setupUi(self)
        self.pushButton.clicked.connect(self.cmd1)
        self.pushButton_2.clicked.connect(self.cmd2)
        self.pushButton_3.clicked.connect(self.cmd3)
        self.pushButton_4.clicked.connect(self.cmd4)

    def cmd1(self):
        """Download the single video page entered in ``lineEdit``."""
        iwara_dld(self.local_path, '', [self.lineEdit.text()], self.year, self.month)

    def cmd2(self):
        """Download every video of the search page entered in ``lineEdit_2``."""
        iwara_dld(self.local_path, self.lineEdit_2.text(), [], self.year, self.month)

    def cmd3(self):
        """Apply the directory, year and month entered in ``lineEdit_3`` to ``lineEdit_5``.

        Year and month are stored as integers. On invalid input the default
        settings are restored and a message is shown.
        """
        try:
            new_path = self.lineEdit_3.text()
            # Keep these as ints: storing the raw text would leave strings
            # in settings that are otherwise integers.
            new_year = int(self.lineEdit_4.text())
            new_month = int(self.lineEdit_5.text())
            mkdir(new_path)
        except (ValueError, OSError):
            # Class attributes still hold the defaults.
            self.local_path = MyWindow.local_path
            self.year = MyWindow.year
            self.month = MyWindow.month
            self.textBrowser.append('\n设置无效,已恢复默认设置')
        else:
            self.local_path = new_path
            self.year = new_year
            self.month = new_month
            self.textBrowser.append('\n更改设置成功')

    def cmd4(self):
        """Show the current settings on stdout and in the text browser."""
        lines = (
            '\nView Download Settings :',
            'Download Directory: %s' % (self.local_path),
            'Year: %s' % str(self.year),
            'Month: %s' % str(self.month),
        )
        for line in lines:
            print(line)
        for line in lines:
            self.textBrowser.append(line)


# Script entry point: create the Qt application, show the main window and
# hand the event loop's return code to the interpreter.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    myWin = MyWindow()
    myWin.show()
    exit_code = app.exec_()
    sys.exit(exit_code)

none.gif

dokidokidoki

B3F  2021-04-18 16:04

回 29楼(AsCenSion) 的帖子

requests文档有例程
设置proxies参数,特别简单

requests,让http服务人类