Responsive image

AsCenSion - 2021-04-16 18:05 [GF]
又闲得慌,写个Iwara爬虫,欢迎大佬指点

复制代码

  1. import requests
    from lxml import etree
    import os
    import re
    from subprocess import call
    #################################
    # 函数定义

    # 创建文件夹
    def mkdir(path):
        """Create the directory *path*, including missing parents.

        Nothing is done when something already exists at *path*.
        """
        if os.path.exists(path):
            return
        os.makedirs(path)
    # 使用IDM下载资源
    def idm_dld(idm_url, idm_path, idm_name, idm_dir="C:\\Green\\IDM 6.36.5"):
        """Queue one file in Internet Download Manager and start the queue.

        Args:
            idm_url: Direct URL of the file to download.
            idm_path: Local directory handed to IDM with the '/p' switch.
            idm_name: Local file name handed to IDM with the '/f' switch.
            idm_dir: Directory that contains IDMan.exe.

        Raises:
            OSError: If IDMan.exe cannot be started from *idm_dir*.
        """
        # Call IDM by absolute path instead of os.chdir(): changing the
        # working directory of the whole process would silently re-root every
        # relative download path used after the first download.
        idm_exe = os.path.join(idm_dir, "IDMan.exe")
        # First call adds the file to the queue ('/a'), second starts the queue ('/s').
        call([idm_exe, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'])
        call([idm_exe, '/s'])
    # 提取视频源
    def get_source_url(url):
        """Resolve a video page URL to the URL of its 'Source' quality file.

        The API endpoint is derived from the page URL by replacing the first
        'videos' with 'api/video'. Its JSON reply is scanned for the entry
        whose 'resolution' is 'Source'.

        Returns:
            'https:' followed by that entry's 'uri', or None when the reply
            has no such entry.
        """
        pieces = url.split('videos', 1)
        api_endpoint = pieces[0] + 'api/video' + pieces[1]
        entries = requests.get(api_endpoint).json()
        return next(
            ('https:' + entry['uri'] for entry in entries if entry['resolution'] == 'Source'),
            None,
        )
    # 从页面提取视频网址
    def get_urls(search_url):
        """Return the absolute URL of every video link on a listing page.

        Args:
            search_url: URL of the search/listing page; an empty value means
                "nothing to search".

        Returns:
            A list of 'https://ecchi.iwara.tv'-prefixed page URLs, empty when
            *search_url* is empty or the page has no matching links.
        """
        if not search_url:
            return []
        # Fetch the page once; the hrefs found on it are site-relative.
        page = etree.HTML(requests.get(search_url).text)
        hrefs = page.xpath('//div[@class="field-item even"]/a/@href')
        # Prefix each link individually (the old code concatenated the prefix
        # with the whole list, which raises TypeError).
        return ['https://ecchi.iwara.tv' + href for href in hrefs]
    # 下载多个网页的视频
    # 在expire_year-expire_month之前的视频不会下载
    def videos_dld(web_urls, dld_path, expire_year, expire_month):
        """Download the source file of each video page in *web_urls* via IDM.

        Only URLs containing 'videos' are processed. Each file is stored in a
        per-author folder below *dld_path* under a date-prefixed name taken
        from the source URL. A file already present under its undated "old"
        name is renamed instead of downloaded again.

        Args:
            web_urls: List of video page URLs.
            dld_path: Root download directory.
            expire_year: Year of the cut-off date.
            expire_month: Month of the cut-off date. Processing stops at the
                first video dated before expire_year-expire_month.
        """
        total = len(web_urls)
        for index, page_url in enumerate(web_urls, start=1):
            print(str(index) + '/' + str(total) + '\t' + page_url)
            if 'videos' not in page_url:
                continue
            source_url = get_source_url(page_url)
            if source_url is None:
                continue
            matched = re.search('file=(.*?)&op', source_url)
            if matched is None:
                print('\tState: no file name in source url, skipped')
                continue
            authors = etree.HTML(requests.get(page_url).text).xpath('//a[@class="username"]/text()')
            if not authors:
                print('\tState: author not found, skipped')
                continue
            author = authors[0]
            print('\t' + 'Author: ' + author)
            print('\t' + source_url)
            # The URL-encoded path separators become '-', giving year-month-day-name.
            file_name = matched.group(1).replace('%2F', '-')
            print('\t' + 'New name: ' + file_name)
            # Split off only the three date fields so a '-' inside the
            # original name is kept.
            name_parts = file_name.split('-', 3)
            try:
                video_date = (int(name_parts[0]), int(name_parts[1]))
                old_file_name = name_parts[3]
            except (IndexError, ValueError):
                print('\tState: unexpected file name format, skipped')
                continue
            file_path = dld_path + '\\' + author
            mkdir(file_path)
            local_name = file_path + '\\' + file_name
            print('\t' + 'Old name: ' + old_file_name)
            old_local_name = file_path + '\\' + old_file_name
            # Check the date: stop at the first video older than the cut-off.
            if video_date < (expire_year, expire_month):
                print('\tEarly date')
                break
            has_new = os.path.isfile(local_name)
            has_old = os.path.isfile(old_local_name)
            if not has_new and not has_old:
                print('\tState: no such file, to be downloaded')
                idm_dld(source_url, file_path, file_name)
            elif has_old and has_new:
                print('\tState: old and new file exist')
                os.remove(old_local_name)
            elif has_old:
                print('\tState: old file exist')
                os.rename(old_local_name, local_name)
            else:
                print('\tState: new file exist')


    # 整合版
    def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
        """Download the videos of a search page, then the listed video pages.

        Videos found through *search_url* are limited by the cut-off date
        expire_year-expire_month. The pages in *web_urls* are processed with
        a fixed cut-off of year 2000, month 0.
        """
        # Videos discovered on the search page.
        videos_dld(get_urls(search_url), dld_path, expire_year, expire_month)
        # Videos given directly by URL.
        videos_dld(web_urls, dld_path, 2000, 0)



    ##################################
    # 主程序
    ##################################
    def main():
        """Run the interactive console menu of the downloader.

        The menu offers: download one video page, download every video of a
        search page, change the settings, show the settings, quit. Settings
        start at built-in defaults and last only for this session. A
        non-numeric menu choice shows the menu again; end of input (closed
        stdin) leaves the loop.
        """
        # Defaults, also the fallback when newly entered settings are rejected.
        default_path = "C:\\Another\\temp\\iwara"
        default_year = 2020
        default_month = 3
        local_path, year, month = default_path, default_year, default_month
        print('------------------------------------')
        print('Iwara Downloader V1.0')
        print('------------------------------------')
        print('Default Settings:')
        print('Download Directory: %s' % (local_path))
        print('Year: %s' % str(year))
        print('Month: %s' % str(month))
        try:
            while True:
                print('------------------------------------\n------------------------------------\nAvailable Functions:')
                print('1.Download Video Directly From Link')
                print('2.Download All Videos In The Searching Page')
                print('3.Change Download Settings')
                print('4.View Download Settings')
                print('5.Quit')
                try:
                    cmd = int(input('\nChoose Desired Function:'))
                except ValueError:
                    # Not a number: treat as "no command" and show the menu again.
                    cmd = 0
                if cmd == 1:
                    page_url = input('\nDownload Video Directly From Link (Enter To Skip):')
                    iwara_dld(local_path, '', [page_url], year, month)
                elif cmd == 2:
                    search_url = input('\nDownload All Videos In The Searching Page (Enter To Skip):')
                    iwara_dld(local_path, search_url, [], year, month)
                elif cmd == 3:
                    try:
                        new_path = input('\nSet Download Directory:')
                        mkdir(new_path)
                        new_year = int(input('\nSet Year'))
                        new_month = int(input('\nSet Month'))
                    except (OSError, ValueError):
                        # Unusable directory or non-numeric date: fall back to defaults.
                        print('Invalid settings, defaults restored')
                        local_path, year, month = default_path, default_year, default_month
                    else:
                        local_path, year, month = new_path, new_year, new_month
                elif cmd == 4:
                    print('\nView Download Settings :')
                    print('Download Directory: %s' % (local_path))
                    print('Year: %s' % str(year))
                    print('Month: %s' % str(month))
                elif cmd == 5:
                    print('\nQuit')
                    break
                print('Finished\n')
        except EOFError:
            # stdin is closed, so no further command can arrive; leave the
            # menu instead of re-printing it forever.
            print('\nQuit')


    # Start the console menu only when the file is executed as a script.
    if __name__ == '__main__':
        main()


为加快下载速度调用了IDM,需要把IDM本体安装到“C:\Green\IDM 6.36.5”文件夹中,运行爬虫的时候需要保证IDM已经打开。

度盘IDM链接:
链接: https://pan.baidu.com/s/1ZsjNK8kgrLHxb7AlNZ8gEw
提取码: 9msh
解压后运行“!绿化.bat”就行了

打包的.exe文件:
链接: https://pan.baidu.com/s/1FAoyJEOZHW_rl-xo9BtAFA
提取码: qkdm

使用截图:





欢迎打赏
此帖售价 0 SP币,已有 67 人购买
若发现会员采用欺骗的方法获取财富,请立刻举报,我们会对会员处以2-N倍的罚金,严重者封掉ID!


acbd - 2021-04-16 18:08 [B1F]
强啊


menboko - 2021-04-16 18:09 [B2F]
这....稍微看了下,用python写的?


lunarshiny - 2021-04-16 18:15 [B3F]
爬虫我记得一般好像都是用python写的 比较方便


suzumi - 2021-04-16 18:20 [B4F]
18行  IDMPath = "C:\\Green\\IDM 6.36.5"这类改路径就可以不用把IDM限定在C盘了。


anonymous052 - 2021-04-16 18:33 [B5F]
用户被禁言,该主题自动屏蔽!


qiuqiumaomao - 2021-04-16 18:45 [B6F]
大佬,最近做深度学习要学python,书给劝退了


letsgo - 2021-04-16 18:49 [B7F]
不明觉厉


b818a538 - 2021-04-16 18:51 [B8F]
  


林北 - 2021-04-16 19:01 [B9F]
虽然我没使用,不过还是买了打赏老哥


AsCenSion - 2021-04-16 21:05 [B10F]
感谢诸君的资瓷     


大吗H - 2021-04-16 21:48 [B11F]
用了这个下BT能变快吗?楼主


节操救援 - 2021-04-16 22:18 [B12F]
还没用过,不过射射楼主


FFYMM - 2021-04-16 22:31 [B13F]
用个Qt 会漂亮点


wuya! - 2021-04-17 05:08 [B14F]
高技术力


章鱼哥 - 2021-04-17 08:07 [B15F]
带科学的GHS就是带劲


5ec42f86 - 2021-04-17 08:10 [B16F]
好厉害,GHS都这么高科技要求了吗


沫灵月 - 2021-04-17 08:11 [B17F]
太强了吧


mo - 2021-04-17 08:22 [B18F]
太厉害了


urbbrgroun - 2021-04-17 09:47 [B19F]
马克一下,买了买了


古德莉莉安 - 2021-04-17 09:53 [B20F]
牛啊,大佬


西门小少 - 2021-04-17 10:17 [B21F]
  


SB - 2021-04-17 11:08 [B22F]
mark


Melt - 2021-04-17 13:42 [B23F]
支持一个


liky - 2021-04-17 15:59 [B24F]
    可以试试定时监控av磁力 , 一更新就提取并创建磁力任务


可畏 - 2021-04-17 16:42 [B25F]


dokidokidoki - 2021-04-17 19:58 [B26F]
哈哈,和我一样,不面向对象
最近的实验都是用python写的,python真jb好用
可以加个qt5界面,命令行还是太low了,很简单的,   


dokidokidoki - 2021-04-17 20:58 [B27F]
在楼主的基础上加了个qt5的界面
看了一下,requests没有设置user-agent,我这里是爬取不到东西的
楼主那里好用吗
爬虫的老问题了,ssl error,或者'远程主机强迫关闭了一个现有的连接。',之类的,懒得调试了
没用过iwara,需要设置代理或者设置cookie吗







import requests
from lxml import etree
import os
import re
from subprocess import call
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QApplication, QMainWindow
import sys

#################################
# 函数定义
# 创建文件夹
def mkdir(path):
    """Create the directory *path*, including missing parents.

    Nothing is done when something already exists at *path*.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)
# 使用IDM下载资源
def idm_dld(idm_url, idm_path, idm_name, idm_dir=r"C:\Users\zero\Desktop\IDM"):
    """Queue one file in Internet Download Manager and start the queue.

    Args:
        idm_url: Direct URL of the file to download.
        idm_path: Local directory handed to IDM with the '/p' switch.
        idm_name: Local file name handed to IDM with the '/f' switch.
        idm_dir: Directory that contains IDMan.exe.

    Raises:
        OSError: If IDMan.exe cannot be started from *idm_dir*.
    """
    # Call IDM by absolute path instead of os.chdir(): changing the working
    # directory of the whole process would silently re-root every relative
    # download path used after the first download.
    idm_exe = os.path.join(idm_dir, "IDMan.exe")
    # First call adds the file to the queue ('/a'), second starts the queue ('/s').
    call([idm_exe, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'])
    call([idm_exe, '/s'])
# 提取视频源
def get_source_url(url):
    """Resolve a video page URL to the URL of its 'Source' quality file.

    The API endpoint is derived from the page URL by replacing the first
    'videos' with 'api/video'. Its JSON reply is scanned for the entry whose
    'resolution' is 'Source'.

    Returns:
        'https:' followed by that entry's 'uri', or None when the reply has
        no such entry.
    """
    pieces = url.split('videos', 1)
    api_endpoint = pieces[0] + 'api/video' + pieces[1]
    entries = requests.get(api_endpoint).json()
    return next(
        ('https:' + entry['uri'] for entry in entries if entry['resolution'] == 'Source'),
        None,
    )
# 从页面提取视频网址
def get_urls(search_url):
    """Return the absolute URL of every video link on a listing page.

    Args:
        search_url: URL of the search/listing page; an empty value means
            "nothing to search".

    Returns:
        A list of 'https://ecchi.iwara.tv'-prefixed page URLs, empty when
        *search_url* is empty or the page has no matching links.
    """
    if not search_url:
        return []
    # Fetch the page once; the hrefs found on it are site-relative.
    page = etree.HTML(requests.get(search_url).text)
    hrefs = page.xpath('//div[@class="field-item even"]/a/@href')
    # Prefix each link individually (the old code concatenated the prefix
    # with the whole list, which raises TypeError).
    return ['https://ecchi.iwara.tv' + href for href in hrefs]
# 下载多个网页的视频
# 在expire_year-expire_month之前的视频不会下载
def videos_dld(web_urls, dld_path, expire_year, expire_month):
    """Download the source file of each video page in *web_urls* via IDM.

    Only URLs containing 'videos' are processed. Each file is stored in a
    per-author folder below *dld_path* under a date-prefixed name taken from
    the source URL. A file already present under its undated "old" name is
    renamed instead of downloaded again.

    Args:
        web_urls: List of video page URLs.
        dld_path: Root download directory.
        expire_year: Year of the cut-off date.
        expire_month: Month of the cut-off date. Processing stops at the
            first video dated before expire_year-expire_month.
    """
    total = len(web_urls)
    for index, page_url in enumerate(web_urls, start=1):
        print(str(index) + '/' + str(total) + '\t' + page_url)
        if 'videos' not in page_url:
            continue
        source_url = get_source_url(page_url)
        if source_url is None:
            continue
        matched = re.search('file=(.*?)&op', source_url)
        if matched is None:
            print('\tState: no file name in source url, skipped')
            continue
        authors = etree.HTML(requests.get(page_url).text).xpath('//a[@class="username"]/text()')
        if not authors:
            print('\tState: author not found, skipped')
            continue
        author = authors[0]
        print('\t' + 'Author: ' + author)
        print('\t' + source_url)
        # The URL-encoded path separators become '-', giving year-month-day-name.
        file_name = matched.group(1).replace('%2F', '-')
        print('\t' + 'New name: ' + file_name)
        # Split off only the three date fields so a '-' inside the original
        # name is kept.
        name_parts = file_name.split('-', 3)
        try:
            video_date = (int(name_parts[0]), int(name_parts[1]))
            old_file_name = name_parts[3]
        except (IndexError, ValueError):
            print('\tState: unexpected file name format, skipped')
            continue
        file_path = dld_path + '\\' + author
        mkdir(file_path)
        local_name = file_path + '\\' + file_name
        print('\t' + 'Old name: ' + old_file_name)
        old_local_name = file_path + '\\' + old_file_name
        # Check the date: stop at the first video older than the cut-off.
        if video_date < (expire_year, expire_month):
            print('\tEarly date')
            break
        has_new = os.path.isfile(local_name)
        has_old = os.path.isfile(old_local_name)
        if not has_new and not has_old:
            print('\tState: no such file, to be downloaded')
            idm_dld(source_url, file_path, file_name)
        elif has_old and has_new:
            print('\tState: old and new file exist')
            os.remove(old_local_name)
        elif has_old:
            print('\tState: old file exist')
            os.rename(old_local_name, local_name)
        else:
            print('\tState: new file exist')
# 整合版
def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
    """Download the videos of a search page, then the listed video pages.

    Videos found through *search_url* are limited by the cut-off date
    expire_year-expire_month. The pages in *web_urls* are processed with a
    fixed cut-off of year 2000, month 0.
    """
    # Videos discovered on the search page.
    videos_dld(get_urls(search_url), dld_path, expire_year, expire_month)
    # Videos given directly by URL.
    videos_dld(web_urls, dld_path, 2000, 0)


class Ui_MainWindow(object):
    """Widget layout of the downloader window.

    NOTE(review): the setupUi/retranslateUi structure looks like pyuic5
    output; if so, regenerate it from the .ui file rather than editing by
    hand.
    """

    def setupUi(self, MainWindow):
        """Create all widgets with fixed geometry and attach them to *MainWindow*."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(614, 531)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        # Text area at the bottom of the window.
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setGeometry(QtCore.QRect(30, 321, 551, 191))
        self.textBrowser.setObjectName("textBrowser")
        # Row 1: label, line edit and button (captions are set in retranslateUi).
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setGeometry(QtCore.QRect(30, 10, 71, 21))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit.setGeometry(QtCore.QRect(120, 10, 311, 21))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setGeometry(QtCore.QRect(460, 10, 111, 21))
        self.pushButton.setObjectName("pushButton")
        # Row 2: second label, line edit and button.
        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setGeometry(QtCore.QRect(30, 50, 72, 15))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_2.setGeometry(QtCore.QRect(120, 50, 311, 21))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setGeometry(QtCore.QRect(460, 50, 111, 21))
        self.pushButton_2.setObjectName("pushButton_2")
        # Settings rows: three label / line-edit pairs.
        self.label_3 = QtWidgets.QLabel(self.centralwidget)
        self.label_3.setGeometry(QtCore.QRect(30, 120, 72, 15))
        self.label_3.setObjectName("label_3")
        self.lineEdit_3 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_3.setGeometry(QtCore.QRect(120, 120, 261, 21))
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.label_4 = QtWidgets.QLabel(self.centralwidget)
        self.label_4.setGeometry(QtCore.QRect(30, 160, 72, 15))
        self.label_4.setObjectName("label_4")
        self.lineEdit_4 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_4.setGeometry(QtCore.QRect(120, 160, 81, 21))
        self.lineEdit_4.setObjectName("lineEdit_4")
        self.label_5 = QtWidgets.QLabel(self.centralwidget)
        self.label_5.setGeometry(QtCore.QRect(30, 200, 72, 15))
        self.label_5.setObjectName("label_5")
        self.lineEdit_5 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_5.setGeometry(QtCore.QRect(120, 200, 81, 21))
        self.lineEdit_5.setObjectName("lineEdit_5")
        # Two buttons on the last settings row.
        self.pushButton_3 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_3.setGeometry(QtCore.QRect(240, 200, 111, 21))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_4 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_4.setGeometry(QtCore.QRect(450, 200, 111, 21))
        self.pushButton_4.setObjectName("pushButton_4")
        MainWindow.setCentralWidget(self.centralwidget)
        # Menu bar and status bar are created but given no entries here.
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 614, 26))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the captions of all labels and buttons."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.label.setText(_translate("MainWindow", "url"))
        self.pushButton.setText(_translate("MainWindow", "直链下载"))
        self.label_2.setText(_translate("MainWindow", "url"))
        self.pushButton_2.setText(_translate("MainWindow", "下载全部视频"))
        self.label_3.setText(_translate("MainWindow", "下载目录"))
        self.label_4.setText(_translate("MainWindow", "年份"))
        self.label_5.setText(_translate("MainWindow", "月份"))
        self.pushButton_3.setText(_translate("MainWindow", "更改下载设置"))
        self.pushButton_4.setText(_translate("MainWindow", "查看下载设置"))

class MyWindow(QMainWindow, Ui_MainWindow):
    """Main window that connects the UI buttons to the downloader functions.

    The class attributes below are the default settings. cmd3() overrides
    them per instance, so the class-level values stay available as the
    fallback.
    """

    # Settings
    # local_path = "D:\\Another\\temp\\ANOTHER_RUBBISH\\VIDEOS\\Iwara"
    local_path = "C:\\Another\\temp\\iwara"
    # Search page URL
    url = ''
    # Video pages
    urls = []
    # Only videos dated from this year/month on are downloaded
    year = 2020
    month = 3
    print('------------------------------------')
    print('Iwara Downloader V1.0')
    print('------------------------------------')
    print('Default Settings:')
    print('Download Directory: %s' % (local_path))
    print('Year: %s' % str(year))
    print('Month: %s' % str(month))

    def __init__(self, parent=None):
        """Build the widgets and connect the four buttons to their handlers."""
        super(MyWindow, self).__init__(parent)
        self.setupUi(self)
        self.pushButton.clicked.connect(self.cmd1)
        self.pushButton_2.clicked.connect(self.cmd2)
        self.pushButton_3.clicked.connect(self.cmd3)
        self.pushButton_4.clicked.connect(self.cmd4)

    def cmd1(self):
        """Download the single video page whose URL is in the first line edit."""
        page_url = self.lineEdit.text()
        iwara_dld(self.local_path, '', [page_url], self.year, self.month)

    def cmd2(self):
        """Download every video of the search page in the second line edit."""
        search_url = self.lineEdit_2.text()
        iwara_dld(self.local_path, search_url, [], self.year, self.month)

    def cmd3(self):
        """Apply the download directory, year and month typed into the form.

        Year and month are converted to int because the date check in
        videos_dld() compares them with integers. If the directory cannot be
        created or a number is invalid, the defaults are restored and the
        failure is reported in the text browser.
        """
        try:
            new_path = self.lineEdit_3.text()
            mkdir(new_path)
            new_year = int(self.lineEdit_4.text())
            new_month = int(self.lineEdit_5.text())
        except (OSError, ValueError):
            # Fall back to the class-level defaults.
            self.local_path = MyWindow.local_path
            self.year = MyWindow.year
            self.month = MyWindow.month
            self.textBrowser.append('\n更改设置失败,已恢复默认设置')
            return
        self.local_path = new_path
        self.year = new_year
        self.month = new_month
        self.textBrowser.append('\n更改设置成功')

    def cmd4(self):
        """Show the current settings on the console and in the text browser."""
        lines = [
            '\nView Download Settings :',
            'Download Directory: %s' % (self.local_path),
            'Year: %s' % str(self.year),
            'Month: %s' % str(self.month),
        ]
        for line in lines:
            print(line)
        for line in lines:
            self.textBrowser.append(line)


# Launch the Qt application only when the file is executed as a script.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    myWin = MyWindow()
    myWin.show()
    # Hand control to the Qt event loop and exit with its return code.
    sys.exit(app.exec_())


AsCenSion - 2021-04-17 21:28 [B28F]
捕捉到大佬     
我这里校园网是可以直连Iwara的


AsCenSion - 2021-04-18 15:30 [B29F]
有大佬知道python如何用clash的socks5代理爬取内容么?     


dokidokidoki - 2021-04-18 16:04 [B30F]
requests文档有例程
设置proxies参数,特别简单

requests,让http服务人类






桌面版


Powered by SP Project v1.0 © 2010-2019
Time 0.706973 second(s),query:4 Gzip enabled


Top