• «
  • 1
  • 2
  • »
  • Pages: 1/2     Go

none.gif

AsCenSion

Iwara爬虫,科学GHS

又闲得慌,写个Iwara爬虫,欢迎大佬指点

复制代码
  1. import requests
  2. from lxml import etree
  3. import os
  4. import re
  5. from subprocess import call
  6. #################################
  7. # 函数定义
  8. # 创建文件夹
  9. def mkdir(path):
  10.     # 判断是否存在文件夹如果不存在则创建为文件夹
  11.     # 如果路径不存在会创建这个路径
  12.     folder = os.path.exists(path)
  13.     if not folder:
  14.         os.makedirs(path)
  15. # 使用IDM下载资源
  16. def idm_dld(idm_url, idm_path, idm_name):
  17.     IDMPath = "C:\\Green\\IDM 6.36.5"
  18.     os.chdir(IDMPath)
  19.     IDM = "IDMan.exe"
  20.     call([IDM, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'])
  21.     call([IDM, '/s'])
  22. # 提取视频源
  23. def get_source_url(url):
  24.     url_api = url.split('videos', 1)[0] + 'api/video' + url.split('videos', 1)[1]
  25.     response = requests.get(url_api)
  26.     js = response.json()
  27.     for item in js:
  28.         if item['resolution'] == 'Source':
  29.             source_url = 'https:' + item['uri']
  30.             return source_url
  31. # 从页面提取视频网址
  32. def get_urls(search_url):
  33.     if search_url == '':
  34.         return []
  35.     requests.get(search_url)
  36.     web_urls = etree.HTML(requests.get(search_url).text).xpath('//div[@class="field-item even"]/a/@href')
  37.     for i in range(len(web_urls)):
  38.         web_urls[i] = 'https://ecchi.iwara.tv' + web_urls[i]
  39.     return web_urls
  40. # 下载多个网页的视频
  41. # 在expire_year-expire_month之前的视频不会下载
  42. def videos_dld(web_urls, dld_path, expire_year, expire_month):
  43.     # 对每个网址
  44.     for i in range(len(web_urls)):
  45.         print(str(i + 1) + '/' + str(len(web_urls)) + '\t' + web_urls[i])
  46.         if web_urls[i].find('videos') >= 0:
  47.             source_url = get_source_url(web_urls[i])
  48.             if source_url is None:
  49.                 continue
  50.             file_name = re.search('file=(.*?)&op', source_url).group(1)
  51.             author = etree.HTML(requests.get(web_urls[i]).text).xpath('//a[@class="username"]/text()')[0]
  52.             print('\t' + 'Author: ' + author)
  53.             print('\t' + source_url)
  54.             file_name = file_name.replace('%2F', '-')
  55.             print('\t' + 'New name: ' + file_name)
  56.             file_path = dld_path + '\\' + author
  57.             mkdir(file_path)
  58.             local_name = file_path + '\\' + file_name
  59.             old_file_name = file_name.split('-')[3]
  60.             print('\t' + 'Old name: ' + old_file_name)
  61.             old_local_name = file_path + '\\' + old_file_name
  62.             # 判断日期
  63.             if ((int(file_name.split('-')[0]) == expire_year) & (int(file_name.split('-')[1]) >= expire_month) | (int(file_name.split('-')[0]) > expire_year)):
  64.                 # 下载
  65.                 if (not os.path.isfile(local_name)) & (not os.path.isfile(old_local_name)):
  66.                     print('\tState: no such file, to be downloaded')
  67.                     idm_dld(source_url, file_path, file_name)
  68.                 elif os.path.isfile(old_local_name) & os.path.isfile(local_name):
  69.                     print('\tState: old and new file exist')
  70.                     os.remove(old_local_name)
  71.                 elif os.path.isfile(old_local_name):
  72.                     print('\tState: old file exist')
  73.                     os.rename(old_local_name, local_name)
  74.                 else:
  75.                     print('\tState: new file exist')
  76.             else:
  77.                 print('\tEarly date')
  78.                 break
  79. # 整合版
  80. def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
  81.     search_urls = get_urls(search_url)
  82.     # print('Download videos from search page')
  83.     videos_dld(search_urls, dld_path, expire_year, expire_month)
  84.     # print('Download videos directly from urls')
  85.     videos_dld(web_urls, dld_path, 2000, 0)
  86. ##################################
  87. # 主程序
  88. ##################################
  89. def main():
  90.     # 设置
  91.     # local_path = "D:\\Another\\temp\\ANOTHER_RUBBISH\\VIDEOS\\Iwara"
  92.     local_path = "C:\\Another\\temp\\iwara"
  93.     # 搜索页面URL
  94.     url = ''
  95.     # 视频网页.
  96.     urls = []
  97.     # 下载该日期之后的视频
  98.     year = 2020
  99.     month = 3
  100.     print('------------------------------------')
  101.     print('Iwara Downloader V1.0')
  102.     print('------------------------------------')
  103.     print('Default Settings:')
  104.     print('Download Directory: %s' % (local_path))
  105.     print('Year: %s' % str(year))
  106.     print('Month: %s' % str(month))
  107.     while True:
  108.         cmd = 0
  109.         try:
  110.             print('------------------------------------\n------------------------------------\nAvailable Functions:')
  111.             print('1.Download Video Directly From Link')
  112.             print('2.Download All Videos In The Searching Page')
  113.             print('3.Change Download Settings')
  114.             print('4.View Download Settings')
  115.             print('5.Quit')
  116.             cmd = int(input('\nChoose Desired Function:'))
  117.         except:
  118.             pass
  119.         if cmd == 1:
  120.             url = ''
  121.             urls = []
  122.             urls.append(input('\nDownload Video Directly From Link (Enter To Skip):'))
  123.             iwara_dld(local_path, url, urls, year, month)
  124.         elif cmd == 2:
  125.             url = ''
  126.             urls = []
  127.             url = input('\nDownload All Videos In The Searching Page (Enter To Skip):')
  128.             iwara_dld(local_path, url, urls, year, month)
  129.         elif cmd == 3:
  130.             try:
  131.                 local_path = input('\nSet Download Directory:')
  132.                 mkdir(local_path)
  133.                 year = int(input('\nSet Year'))
  134.                 month = int(input('\nSet Month'))
  135.             except:
  136.                 local_path = "C:\\Another\\temp\\iwara"
  137.                 year = 2020
  138.                 month = 3
  139.         elif cmd == 4:
  140.             print('\nView Download Settings :')
  141.             print('Download Directory: %s' % (local_path))
  142.             print('Year: %s' % str(year))
  143.             print('Month: %s' % str(month))
  144.         elif cmd == 5:
  145.             print('\nQuit')
  146.             break
  147.         print('Finished\n')
  148. if __name__ == '__main__':
  149.     main()


为加快下载速度调用了IDM,需要把IDM本体安装到“C:\Green\IDM 6.36.5”文件夹中,运行爬虫的时候需要保证IDM已经打开。

度盘IDM链接:
链接: https://pan.baidu.com/s/1ZsjNK8kgrLHxb7AlNZ8gEw
提取码: 9msh
解压后运行“!绿化.bat”就行了

打包的.exe文件:
链接: https://pan.baidu.com/s/1FAoyJEOZHW_rl-xo9BtAFA
提取码: qkdm

使用截图:





欢迎打赏
此帖售价 0 SP币,已有 67 人购买
若发现会员采用欺骗的方法获取财富,请立刻举报,我们会对会员处以2-N倍的罚金,严重者封掉ID!

none.gif

acbd

B1F  2021-04-16 18:08
(学习,爽!)
强啊

none.gif

menboko

这....稍微看了下,用python写的?

5.gif

lunarshiny

B3F  2021-04-16 18:15
(白莲渡世,梵天护航,刀剑相助。)
爬虫我记得一般好像都是用python写的 比较方便

none.gif

suzumi

18行  IDMPath = "C:\\Green\\IDM 6.36.5"这类改路径就可以不用把IDM限定在C盘了。

none.gif

anonymous052

用户被禁言,该主题自动屏蔽!

1374296.jpg

qiuqiumaomao

B6F  2021-04-16 18:45
(喜欢S(头像出处没有))
大佬,最近做深度学习要学python,书给劝退了

7.gif

letsgo

B7F  2021-04-16 18:49
(真实)
不明觉厉

b818a538


rmC7bFRdMXI2Plq.gif

林北

B9F  2021-04-16 19:01
([sell=0]  [/sell])
虽然我没使用,不过还是买了打赏老哥

none.gif

AsCenSion

感谢诸君的资瓷     

277949.jpg

大吗H

B11F  2021-04-16 21:48
(只要是脸蛋好看的女孩子都是我喜欢的类型)
用了这个下BT能变快吗?楼主

none.gif

节操救援

B12F  2021-04-16 22:18
(喔啦)
还没用过,不过射射楼主

none.gif

FFYMM

用个Qt 会漂亮点

102453.jpg

wuya!

高技术力

none.gif

章鱼哥

带科学的GHS就是带劲

none.gif

5ec42f86

好厉害,GHS都这么高科技要求了吗

0U89.md.jpg

沫灵月

太强了吧

none.gif

mo

太厉害了

none.gif

urbbrgroun

马克一下,买了买了

none.gif

古德莉莉安

B20F  2021-04-17 09:53
(123)
牛啊,大佬

427427.jpg

西门小少

  

SB


a11.gif

Melt

支持一个

none.gif

liky

B24F  2021-04-17 15:59
(Liky)
    可以试试定时监控av磁力 , 一更新就提取并创建磁力任务

可畏


none.gif

dokidokidoki

哈哈,和我一样,不面向对象
最近的实验都是用python写的,python真jb好用
可以加个qt5界面,命令行还是太low了,很简单的,   

none.gif

dokidokidoki

在楼主的基础上加了个qt5的界面
看了一下,requests没有设置user-agent,我这里是爬取不到东西的
楼主那里好用吗
爬虫的老问题了,ssl error,或者'远程主机强迫关闭了一个现有的连接。',之类的,懒得调试了
没用过iwara,需要设置代理或者设置cookie吗







import requests
from lxml import etree
import os
import re
from subprocess import call
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QApplication, QMainWindow
import sys

#################################
# 函数定义
# 创建文件夹
def mkdir(path):
    """Create the directory ``path`` (including missing parents) if needed.

    Nothing is done when something already exists at ``path``.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)
# 使用IDM下载资源
def idm_dld(idm_url, idm_path, idm_name, idm_dir=r"C:\Users\zero\Desktop\IDM"):
    """Hand a download over to Internet Download Manager (IDM).

    Args:
        idm_url: URL of the file to download.
        idm_path: Local directory the file should be saved in.
        idm_name: File name the download should be saved as.
        idm_dir: Folder that contains ``IDMan.exe``. Defaults to the
            location that was previously hard-coded in this function.

    Raises:
        OSError: If ``IDMan.exe`` cannot be launched from ``idm_dir``.
    """
    # Launch IDM by its full path instead of os.chdir()-ing into its folder:
    # changing the process-wide working directory would silently re-root any
    # relative download directory used by later calls.
    idm_exe = os.path.join(idm_dir, "IDMan.exe")
    # Per IDM's command-line help: '/d' URL, '/p' target folder, '/f' target
    # file name, '/a' add to the queue without starting the download.
    call([idm_exe, '/d', idm_url, '/p', idm_path, '/f', idm_name, '/a'])
    # '/s' asks IDM to start processing its queue.
    call([idm_exe, '/s'])
# 提取视频源
def get_source_url(url):
    """Look up the original-quality download URL for a video page.

    The page URL is converted to its API counterpart by replacing the first
    ``videos`` in it with ``api/video``. The JSON list returned by that
    endpoint is scanned for the entry whose ``resolution`` is ``'Source'``.

    Returns:
        The ``uri`` of that entry prefixed with ``https:``, or ``None`` when
        no entry has the ``'Source'`` resolution.
    """
    pieces = url.split('videos', 1)
    api_url = pieces[0] + 'api/video' + pieces[1]
    entries = requests.get(api_url).json()
    return next(
        ('https:' + entry['uri']
         for entry in entries
         if entry['resolution'] == 'Source'),
        None,
    )
# 从页面提取视频网址
def get_urls(search_url):
    """Collect the video page URLs linked from a search/listing page.

    Args:
        search_url: URL of the listing page. An empty value (``''`` or
            ``None``) means "nothing to search".

    Returns:
        A list of absolute ``https://ecchi.iwara.tv`` URLs, one per link
        found in the page; an empty list when ``search_url`` is empty.
    """
    if not search_url:
        return []
    # Fetch the page once; the earlier version issued the same request twice
    # and threw the first response away.
    page = etree.HTML(requests.get(search_url).text)
    hrefs = page.xpath('//div[@class="field-item even"]/a/@href')
    # The hrefs are site-relative, so prefix each one with the site root.
    # (Prefixing the list itself, as before, raised TypeError.)
    return ['https://ecchi.iwara.tv' + href for href in hrefs]
# 下载多个网页的视频
# 在expire_year-expire_month之前的视频不会下载
def videos_dld(web_urls, dld_path, expire_year, expire_month):
    """Download the videos behind ``web_urls`` that are recent enough.

    For every URL containing ``videos`` the source file URL and the author
    are looked up, and the file is stored as ``dld_path\\<author>\\<name>``
    where ``<name>`` is the date-prefixed file name (``YYYY-MM-DD-original``).
    A file previously saved under its un-prefixed original name is renamed
    (or removed when the prefixed copy already exists) instead of being
    downloaded again.

    Args:
        web_urls: Sequence of video page URLs; entries without ``videos`` in
            them are skipped.
        dld_path: Root download directory.
        expire_year: Videos dated before ``expire_year``-``expire_month``
            are not downloaded. Accepts an int or a numeric string.
        expire_month: See ``expire_year``.

    Processing stops at the first video that is too old (presumably the
    listing is ordered newest first -- TODO confirm).
    """
    total = len(web_urls)
    for index, page_url in enumerate(web_urls, start=1):
        print(str(index) + '/' + str(total) + '\t' + page_url)
        if 'videos' not in page_url:
            continue
        source_url = get_source_url(page_url)
        if source_url is None:
            continue
        name_match = re.search('file=(.*?)&op', source_url)
        if name_match is None:
            print('\tState: no file name in source url, skipped')
            continue
        authors = etree.HTML(requests.get(page_url).text).xpath('//a[@class="username"]/text()')
        if not authors:
            print('\tState: author not found, skipped')
            continue
        author = authors[0]
        print('\t' + 'Author: ' + author)
        print('\t' + source_url)
        # The file parameter is a URL-encoded path "YYYY/MM/DD/name"; flatten
        # it into a single date-prefixed file name.
        file_name = name_match.group(1).replace('%2F', '-')
        print('\t' + 'New name: ' + file_name)
        # Split at most three times so hyphens inside the original file name
        # are kept instead of truncating it.
        name_parts = file_name.split('-', 3)
        if len(name_parts) < 4:
            print('\tState: unexpected file name, skipped')
            continue
        file_path = dld_path + '\\' + author
        mkdir(file_path)
        local_name = file_path + '\\' + file_name
        old_file_name = name_parts[3]
        print('\t' + 'Old name: ' + old_file_name)
        old_local_name = file_path + '\\' + old_file_name
        # Compare (year, month) pairs; int() also accepts limits that arrive
        # as text from an input field.
        video_date = (int(name_parts[0]), int(name_parts[1]))
        if video_date < (int(expire_year), int(expire_month)):
            print('\tEarly date')
            break
        has_new = os.path.isfile(local_name)
        has_old = os.path.isfile(old_local_name)
        if not has_new and not has_old:
            print('\tState: no such file, to be downloaded')
            idm_dld(source_url, file_path, file_name)
        elif has_old and has_new:
            print('\tState: old and new file exist')
            os.remove(old_local_name)
        elif has_old:
            print('\tState: old file exist')
            os.rename(old_local_name, local_name)
        else:
            print('\tState: new file exist')
# 整合版
def iwara_dld(dld_path, search_url, web_urls, expire_year, expire_month):
    """Run both download modes in sequence.

    First every video linked from ``search_url`` is processed with the
    ``expire_year``/``expire_month`` cut-off; afterwards the explicitly
    given ``web_urls`` are processed with the fixed cut-off 2000-0.
    """
    # Videos discovered through the search page.
    videos_dld(get_urls(search_url), dld_path, expire_year, expire_month)
    # Videos given directly by URL.
    videos_dld(web_urls, dld_path, 2000, 0)


class Ui_MainWindow(object):
    """Widget layout for the downloader's main window.

    NOTE(review): the structure looks like pyuic5-generated code -- confirm
    before editing by hand.
    """

    def setupUi(self, MainWindow):
        """Create all child widgets on ``MainWindow`` with fixed geometry."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(614, 531)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        # Output area at the bottom of the window.
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setGeometry(QtCore.QRect(30, 321, 551, 191))
        self.textBrowser.setObjectName("textBrowser")
        # Row 1: "url" label, input field and the direct-link download button.
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setGeometry(QtCore.QRect(30, 10, 71, 21))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit.setGeometry(QtCore.QRect(120, 10, 311, 21))
        self.lineEdit.setObjectName("lineEdit")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setGeometry(QtCore.QRect(460, 10, 111, 21))
        self.pushButton.setObjectName("pushButton")
        # Row 2: "url" label, input field and the download-all-videos button.
        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setGeometry(QtCore.QRect(30, 50, 72, 15))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_2.setGeometry(QtCore.QRect(120, 50, 311, 21))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setGeometry(QtCore.QRect(460, 50, 111, 21))
        self.pushButton_2.setObjectName("pushButton_2")
        # Settings: download directory.
        self.label_3 = QtWidgets.QLabel(self.centralwidget)
        self.label_3.setGeometry(QtCore.QRect(30, 120, 72, 15))
        self.label_3.setObjectName("label_3")
        self.lineEdit_3 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_3.setGeometry(QtCore.QRect(120, 120, 261, 21))
        self.lineEdit_3.setObjectName("lineEdit_3")
        # Settings: year.
        self.label_4 = QtWidgets.QLabel(self.centralwidget)
        self.label_4.setGeometry(QtCore.QRect(30, 160, 72, 15))
        self.label_4.setObjectName("label_4")
        self.lineEdit_4 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_4.setGeometry(QtCore.QRect(120, 160, 81, 21))
        self.lineEdit_4.setObjectName("lineEdit_4")
        # Settings: month.
        self.label_5 = QtWidgets.QLabel(self.centralwidget)
        self.label_5.setGeometry(QtCore.QRect(30, 200, 72, 15))
        self.label_5.setObjectName("label_5")
        self.lineEdit_5 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_5.setGeometry(QtCore.QRect(120, 200, 81, 21))
        self.lineEdit_5.setObjectName("lineEdit_5")
        # Buttons for changing and viewing the download settings.
        self.pushButton_3 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_3.setGeometry(QtCore.QRect(240, 200, 111, 21))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_4 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_4.setGeometry(QtCore.QRect(450, 200, 111, 21))
        self.pushButton_4.setObjectName("pushButton_4")
        MainWindow.setCentralWidget(self.centralwidget)
        # Empty menu bar and status bar.
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 614, 26))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the visible text of labels and buttons."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.label.setText(_translate("MainWindow", "url"))
        self.pushButton.setText(_translate("MainWindow", "直链下载"))
        self.label_2.setText(_translate("MainWindow", "url"))
        self.pushButton_2.setText(_translate("MainWindow", "下载全部视频"))
        self.label_3.setText(_translate("MainWindow", "下载目录"))
        self.label_4.setText(_translate("MainWindow", "年份"))
        self.label_5.setText(_translate("MainWindow", "月份"))
        self.pushButton_3.setText(_translate("MainWindow", "更改下载设置"))
        self.pushButton_4.setText(_translate("MainWindow", "查看下载设置"))

class MyWindow(QMainWindow, Ui_MainWindow):
    """Main window that wires the UI buttons to the downloader functions.

    The class attributes below are the default settings; ``cmd3`` overrides
    them per instance.
    """

    # Default download directory.
    local_path = "C:\\Another\\temp\\iwara"
    # Search page URL (kept for compatibility; the handlers use locals).
    url = ''
    # Video page URLs (kept for compatibility; the handlers use locals).
    urls = []
    # Only videos dated on or after year-month are downloaded.
    year = 2020
    month = 3
    # Banner printed once, when the class body is executed.
    print('------------------------------------')
    print('Iwara Downloader V1.0')
    print('------------------------------------')
    print('Default Settings:')
    print('Download Directory: %s' % (local_path))
    print('Year: %s' % str(year))
    print('Month: %s' % str(month))

    def __init__(self, parent=None):
        """Build the widgets and connect each button to its handler."""
        super(MyWindow, self).__init__(parent)
        self.setupUi(self)
        self.pushButton.clicked.connect(self.cmd1)
        self.pushButton_2.clicked.connect(self.cmd2)
        self.pushButton_3.clicked.connect(self.cmd3)
        self.pushButton_4.clicked.connect(self.cmd4)

    def cmd1(self):
        """Download the single video whose page URL is in the first field."""
        iwara_dld(self.local_path, '', [self.lineEdit.text()], self.year, self.month)

    def cmd2(self):
        """Download all videos linked from the search page in the second field."""
        iwara_dld(self.local_path, self.lineEdit_2.text(), [], self.year, self.month)

    def cmd3(self):
        """Apply the download settings typed into the settings fields.

        Year and month are converted to ``int`` so later date comparisons
        work. If the directory cannot be created or a number is invalid,
        the defaults are restored and the reason is shown in the output area.
        """
        try:
            new_path = self.lineEdit_3.text()
            mkdir(new_path)
            new_year = int(self.lineEdit_4.text())
            new_month = int(self.lineEdit_5.text())
        except (OSError, ValueError) as err:
            # Fall back to the class-level defaults, as before, but tell the
            # user instead of failing silently.
            self.local_path = MyWindow.local_path
            self.year = MyWindow.year
            self.month = MyWindow.month
            self.textBrowser.append('\n更改设置失败,已恢复默认设置: %s' % err)
        else:
            self.local_path = new_path
            self.year = new_year
            self.month = new_month
            self.textBrowser.append('\n更改设置成功')

    def cmd4(self):
        """Show the current settings on stdout and in the output area."""
        lines = [
            '\nView Download Settings :',
            'Download Directory: %s' % (self.local_path),
            'Year: %s' % str(self.year),
            'Month: %s' % str(self.month),
        ]
        for line in lines:
            print(line)
        for line in lines:
            self.textBrowser.append(line)


if __name__ == '__main__':
    # Start the Qt application, show the main window and hand the event
    # loop's exit status back to the shell.
    application = QApplication(sys.argv)
    window = MyWindow()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

none.gif

AsCenSion

回 27楼(零3零论文4恶法傅) 的帖子

捕捉到大佬     
我这里校园网是可以直连Iwara的

none.gif

AsCenSion

有大佬知道python如何用clash的socks5代理爬取内容么?     

none.gif

dokidokidoki

回 29楼(AsCenSion) 的帖子

requests文档有例程
设置proxies参数,特别简单

requests,让http服务人类
  • «
  • 1
  • 2
  • »
  • Pages: 1/2     Go