2. 一个简单的糗百客户端

这是初学python时的一个练手脚本,使用了:

  • wxpython
  • requests
  • BeautifulSoup4
  • ...

功能:

  • 后台爬取数据
  • 预先加载
  • 图片也能显示
  • 类别切换
  • ... 等等

给一个编译后的: qb.exe

这是效果图

../_images/qb.png

源码:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# coding:utf-8

from __future__ import absolute_import, unicode_literals

__author__ = "golden"
__date__ = '2017/8/3'

# !/usr/bin/python
# -*- coding:utf-8 -*-


import wx
import threading
import time
import sys, os
import requests
from bs4 import BeautifulSoup


########################################################################
class SpiderThread(threading.Thread):
    """
        爬虫 线程
    """

    # ----------------------------------------------------------------------
    def __init__(self, category, page_data, current_page, current_item, show_info, set_status_text, lock, name):
        threading.Thread.__init__(self)
        self.category = category
        self.page_data = page_data
        self.current_page = current_page
        self.current_item = current_item
        self.set_status_text = set_status_text
        self.show_info = show_info
        self.lock = lock
        self.name = name

    def run(self):
        self.load_page()

    def get_page(self, page_num):
        wx.CallAfter(self.set_status_text, message='正在加载第 %s 页...' % str(page_num))
        URL = "http://www.qiushibaike.com/" + self.category + "/page/" + str(page_num)
        agent_header = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': agent_header}
        page_content = requests.get(URL, headers=headers).content  # .decode('utf-8').encode('GBK','ignore')
        soup = BeautifulSoup(page_content, "lxml")
        items1 = soup.find_all('div', class_='article')
        item_number = 1
        if not os.path.exists('tmp_jpg'):
            os.makedirs('tmp_jpg')
        if not os.path.exists(r'tmp_jpg/def.jpg'):
            ir = requests.get('http://pic8.nipic.com/20100703/4887831_015505282659_2.jpg')
            open('tmp_jpg/def.jpg', 'wb').write(ir.content)
        for item in items1:
            try:
                myjpg = item.find('div', class_='thumb').find('img')
            except Exception as e:
                myjpg = None
            stats_vote = item.find('span', class_='stats-vote').find('i').get_text()  ####提取好笑个数
            stats_comments = item.find('span', class_='stats-comments').find('i').get_text()  ####提取回复个数
            voting = [span.get_text() for span in item.find_all('span', class_='number hidden')]  ####提取顶、拍个数
            auth = item.find('h2').get_text().strip()
            content = item.find('div', class_='content').find('span').get_text().strip()
            if page_num not in self.page_data.keys():
                self.page_data[page_num] = {}
            self.page_data[page_num].update(
                {item_number: {
                    'auth': auth,
                    'content': content,
                    'stats_vote': stats_vote,
                    'stats_comments': stats_comments,
                    'voting_up': voting[0],
                    'voting_down': voting[1],
                    'jpg': '',
                    'jpg_name': '',
                }})
            if myjpg:
                jpg_name = myjpg['src'].split('/')[-1]
                ir = requests.get(myjpg['src'].replace('//', 'http://'))
                open('tmp_jpg/' + jpg_name, 'wb').write(ir.content)
                self.page_data[page_num][item_number].update({
                    'jpg': myjpg,
                    'jpg_name': jpg_name,
                })
            item_number += 1
            wx.CallAfter(self.set_status_text, message='第 %s 页已加载 %s 条' % (str(page_num), str(item_number)))
        wx.CallAfter(self.show_info, message='第%s页(共%s条)加载完成。' % (str(page_num), str(item_number)))

    def load_page(self):
        while self.lock.isSet():
            if self.page_count < self.current_page + 2:
                try:
                    self.get_page(str(self.page_count + 1))
                    time.sleep(0.1)
                except Exception as ex:
                    msg = u'无法连接糗百:%s' % ex
                    wx.CallAfter(self.set_status_text, message=msg)
                    time.sleep(1)
            else:
                msg = u'加载爬虫休眠中...'
                wx.CallAfter(self.set_status_text, message=msg)
                time.sleep(3)
        wx.CallAfter(self.set_status_text, message='%s 成功退出。' % self.name)

    @property
    def page_count(self):
        return len(self.page_data.keys())


########################################################################
class MyFrame(wx.Frame):
    """
        重构Frame
    """

    # ----------------------------------------------------------------------
    def __init__(self, page_data, current_page, current_item):
        self.page_data = page_data
        self.current_page = current_page
        self.current_item = current_item
        self.current_page_data = {}
        self.current_item_data = {}
        self.category = 'hot'
        self.lock = threading.Event()
        wx.Frame.__init__(self, None, -1, u'我的糗百客户端', size=(600, 720))
        self.create_menu_bar()
        panel = wx.Panel(self, -1)
        panel.SetBackgroundColour('white')
        self.qbtext = wx.TextCtrl(panel, -1, pos=(100, 10), size=(400, 150),
                                  style=wx.TE_CENTER | wx.TE_READONLY | wx.TE_MULTILINE | wx.TE_NOHIDESEL | wx.TE_RICH2)
        self.stc = wx.StaticText(panel, -1, pos=(150, 0))
        self.stccom = wx.StaticText(panel, -1, pos=(150, 155))
        self.jpgbutton = wx.BitmapButton(panel, -1)
        self.status_bar = self.CreateStatusBar()
        next_button = wx.Button(panel, label=u'下一条', pos=(520, 300), size=(40, 100), style=wx.BU_ALIGN_MASK)
        next_button.Bind(wx.EVT_BUTTON, self.next_item, next_button)
        previous_button = wx.Button(panel, label=u'上一条', pos=(20, 300), size=(40, 100), style=wx.BU_ALIGN_MASK)
        previous_button.Bind(wx.EVT_BUTTON, self.previous_item, previous_button)
        self.jpgbutton.Bind(wx.EVT_BUTTON, self.next_item, self.jpgbutton)
        self.show_info()
        self.Show()

    def show_info(self, message=''):
        self.load_info()
        if self.current_item_data:
            text = self.current_item_data.get('content')
            voting_up = self.current_item_data.get('voting_up')
            voting_down = self.current_item_data.get('voting_down')
            stats_comments = self.current_item_data.get('stats_comments')
            stats_vote = self.current_item_data.get('stats_vote')
            auth = self.current_item_data.get('auth')
        else:
            text = '正在加载中...'
            voting_up = 0
            voting_down = 0
            stats_comments = 0
            stats_vote = 0
            auth = ''
        self.qbtext.SetLabel(text)
        self.stc.SetLabel(
            u'第 ' + str(self.current_page) + u' 页 第 ' + str(self.current_item) + u' 条 作者:' + auth)
        self.stccom.SetLabel(
            u'%s个顶  %s个拍  %s个评论  %s个好笑' % (voting_up, voting_down, stats_comments, stats_vote))
        self.jpgbutton.SetBitmap(self.jpg)
        self.jpgbutton.SetPosition(self.jpg_pose)
        self.jpgbutton.SetSize(self.jpg_size)

    def set_status_text(self, message):
        if not self.status_bar.GetStatusText() == message:
            self.status_bar.SetStatusText(message)

    def next_item(self, event):
        if self.current_page_item_count > self.current_item:
            self.current_item += 1
        else:
            self.current_page += 1
            self.current_item = 1
            if str(self.current_page) in self.page_data:
                self.current_page_data = self.page_data[str(self.current_page)]
        self.show_info()

    @property
    def current_page_item_count(self):
        return len(self.current_page_data.keys())

    def previous_item(self, event):
        if self.current_item > 1:  # 当前页面大于1,到当前页前一条
            self.current_item -= 1
        else:  # 到前一页最后一条
            if self.current_page > 1:  # 有前一页
                self.current_page -= 1
                self.current_page_data = self.page_data[str(self.current_page)]
                self.current_item = max(self.current_page_data.keys())
            else:
                self.set_status_text(u'前面没有页了')
        self.show_info()

    def load_info(self):
        if not self.current_page_data:
            self.current_page_data = self.page_data.get(str(self.current_page), {})
        if self.current_item in self.current_page_data.keys():
            self.current_item_data = self.current_page_data[self.current_item]
            if self.current_item_data.get('jpg'):
                jpg_path = 'tmp_jpg/' + self.current_item_data.get('jpg_name')
                jpg = wx.Image(jpg_path, type=wx.BITMAP_TYPE_JPEG)
                W, H = jpg.GetWidth(), jpg.GetHeight()
                if (W > 400 and H <= 500) or (W > H and W > 400 and H > 500):
                    H = 400 * H / W
                    W = 400
                elif (W <= 400 and H > 500) or (400 < W < H and H > 500):
                    W = 500 * W / H
                    H = 500
                self.jpg_pose = (300 - W / 2, 420 - H / 2)
                self.jpg_size = (W, H)
                self.jpg = jpg.Rescale(W, H).ConvertToBitmap()
            else:
                jpg_path = 'tmp_jpg/def.jpg'
                self.jpg_pose = (150, 270)
                self.jpg_size = (301, 300)
                self.jpg = wx.Image(jpg_path, type=wx.BITMAP_TYPE_JPEG).ConvertToBitmap()
        else:
            jpg_path = 'tmp_jpg/def.jpg'
            self.jpg_pose = (150, 270)
            self.jpg_size = (301, 300)
            self.jpg = wx.Image(jpg_path, type=wx.BITMAP_TYPE_JPEG).ConvertToBitmap()

    def create_menu_bar(self):
        menu_bar = wx.MenuBar()
        for each in self.menu_data:
            menu_label = each[0]
            menu_item = each[1:]
            menu_bar.Append(self.create_menu(menu_item), menu_label)
        self.SetMenuBar(menu_bar)
        return menu_bar

    def create_menu(self, menu_data):
        menu = wx.Menu()
        kind = wx.ITEM_NORMAL
        for _data in menu_data:
            if len(_data) == 3:
                label, status, handler = _data
            else:
                label, status, handler, kind = _data
            if not label:
                menu.AppendSeparator()
                continue
            menu_item = menu.Append(-1, label, status, kind)
            self.Bind(wx.EVT_MENU, handler, menu_item)
        return menu

    def set_category(self, event):
        categorys = {
            '8hr': '热门',
            'hot': '24小时',
            'imgrank': '热图',
            'text': '文字',
            'history': '穿越',
            'pic': '糗图',
            'textnew': '新鲜'
        }
        categorys = {categorys[key]: key for key in categorys}
        menu_bar = self.GetMenuBar()
        item_id = event.GetId()
        item = menu_bar.FindItemById(item_id)
        category = categorys.get(item.GetLabel())
        self.category = category
        self.page_data = {}
        self.current_page_data = {}
        self.current_item_data = {}
        self.current_page = 1
        self.current_page = 1
        self.show_info()
        self.stop_spider()
        self.start_spider()

    def defa(self):
        pass

    @property
    def menu_data(self):
        return (
            (u"文件",
             (u"新建", u"新建窗口", self.defa),
             (u'关闭', u'关闭当前窗口', self.defa)),
            (u'编辑',
             (u'复制', u'复制结果到剪切板', self.defa),
             (u'粘贴', u'粘贴剪切板内容到输入框', self.defa),
             (u'清空剪切板', u'清空剪切板', self.defa),
             ('', '', ''),
             (u'选项', u'选项', self.defa)),
            (u'分类',
             (u'热门', u'热门', self.set_category, wx.ITEM_RADIO),
             (u'24小时', u'24小时', self.set_category, wx.ITEM_RADIO),
             (u'热图', u'热图', self.set_category, wx.ITEM_RADIO),
             (u'文字', u'文字', self.set_category, wx.ITEM_RADIO),
             (u'穿越', u'穿越', self.set_category, wx.ITEM_RADIO),
             (u'糗图', u'糗图', self.set_category, wx.ITEM_RADIO),
             (u'新鲜', u'新鲜', self.set_category, wx.ITEM_RADIO)),
            (u'帮助',
             (u'关于', u'关于', self.defa))

        )


        ########################################################################

    def start_spider(self):
        self.lock.set()
        sp = SpiderThread(self.category, self.page_data, self.current_page, self.current_item, self.show_info,
                          self.set_status_text, lock=self.lock, name=self.category)
        sp.setDaemon(1)
        sp.start()
        self.spider_thread = sp
        self.set_status_text('%s 爬虫启动。' % self.category)

    def stop_spider(self):
        self.lock.clear()
        while True:
            if self.spider_thread.is_alive():
                time.sleep(1)
            else:
                self.set_status_text('%s 爬虫成功退出' % self.category)
                break


class MyApp(wx.App):
    """
    重构APP
    """

    # ----------------------------------------------------------------------
    def __init__(self, page_data, current_page, current_item):
        """Constructor"""
        wx.App.__init__(self)
        frame = MyFrame(page_data, current_page, current_item)
        frame.start_spider()
        frame.Center()
        frame.Show()
        self.page_data = page_data
        self.current_page = current_page
        self.current_item = current_item
        self.frame = frame

    def MainLoop(self):
        return super(MyApp, self).MainLoop()


########################################################################


if __name__ == '__main__':
    page_data = {}  # 数据
    current_page = 1  # 当前页数,从1开始
    current_item = 1  # 当前条数,从1开始
    app = MyApp(page_data, current_page, current_item)
    app.MainLoop()