打码方式使用机器学习

2019-06-29 11:07:34 +08:00
parent 8cabf157f6
commit 707fd5bd8a
8 changed files with 237 additions and 26 deletions
--- a/py12306/helpers/OCR.py
+++ b/py12306/helpers/OCR.py
@@ -2,7 +2,6 @@ import math
 import random

 from py12306.config import Config
-from py12306.helpers.api import *
 from py12306.helpers.request import Request
 from py12306.log.common_log import CommonLog
 from py12306.vender.ruokuai.main import RKClient
@@ -54,26 +53,21 @@ class OCR:
            positions.append(int(y))
        return positions

-    def get_image_by_free_site(self, img):
-        data = {
-            'base64': img
-        }
-        response = self.session.post(API_FREE_CODE_QCR_API, json=data)
-        result = response.json()
-        if result.get('success') and result.get('data.check'):
-            check_data = {
-                'check': result.get('data.check'),
-                'img_buf': img,
-                'logon': 1,
-                'type': 'D'
-            }
-            check_response = self.session.post(API_FREE_CODE_QCR_API_CHECK, json=check_data)
-            check_result = check_response.json()
-            if check_result.get('res'):
-                position = check_result.get('res')
-                return position.replace('(', '').replace(')', '').split(',')
+    @staticmethod
+    def get_image_by_free_site(img):
+        """Recognize a captcha locally through ``ml_predict.get_coordinate``.
+
+        :param img: base64-encoded captcha image data.
+        :return: the value returned by ``get_coordinate`` when it is truthy;
+            otherwise execution falls through to the method's trailing
+            ``return None``.
+        """
+        # Imported inside the method rather than at module import time.
+        from py12306.helpers.ocr.ml_predict import get_coordinate
+        import base64
+
+        # Decode the payload and write it to an image file (path is relative to
+        # the current working directory)
+        with open('authcode.jpg', 'wb') as image:
+            image.write(base64.b64decode(img))
+
+        result = get_coordinate('authcode.jpg')
+        # Debug aid (disabled): CommonLog.print_auth_code_info("captcha recognition result: " + result)
+
+        if result:
+            return result

-        CommonLog.print_auto_code_fail(CommonLog.MESSAGE_GET_RESPONSE_FROM_FREE_AUTO_CODE)
        return None


--- a/py12306/helpers/api.py
+++ b/py12306/helpers/api.py
@@ -46,8 +46,4 @@ API_GET_BROWSER_DEVICE_ID = BASE_URL_OF_12306 + '/otn/HttpZF/logdevice'
 API_NOTIFICATION_BY_VOICE_CODE = 'http://ali-voice.showapi.com/sendVoice?'
 API_NOTIFICATION_BY_VOICE_CODE_DINGXIN = 'http://yuyin2.market.alicloudapi.com/dx/voice_notice'

-# API_FREE_CODE_QCR_API = 'http://60.205.200.159/api'  # 19-03-07 接口已失效
-API_FREE_CODE_QCR_API = 'https://12306.jiedanba.cn/api/v2/getCheck'
-API_FREE_CODE_QCR_API_CHECK = 'http://check.huochepiao.360.cn/img_vcode'
-
 API_CHECK_CDN_AVAILABLE = 'https://{}/otn/dynamicJs/omseuuq'
--- a/py12306/helpers/auth_code.py
+++ b/py12306/helpers/auth_code.py
@@ -32,7 +32,11 @@ class AuthCode:
        if not position:  # 打码失败
            return self.retry_get_auth_code()

-        answer = ','.join(map(str, position))
+        if Config().AUTO_CODE_PLATFORM == 'free':
+            answer = position
+        else:
+            answer = ','.join(map(str, position))
+
        if not self.check_code(answer):
            return self.retry_get_auth_code()
        return position
--- a/py12306/helpers/ocr/ml_predict.py
+++ b/py12306/helpers/ocr/ml_predict.py
@@ -0,0 +1,123 @@
+# coding: utf-8
+import cv2, os
+import numpy as np
+from keras import models
+from py12306.log.common_log import CommonLog
+
+
+def get_text(img, offset=0):
+    """Cut one label-word region out of a captcha image and prepare it as model input.
+
+    :param img: captcha image as loaded by ``cv2.imread`` (BGR channel order).
+    :param offset: horizontal shift, in pixels, applied to the crop window.
+    :return: array of shape ``(1, height, width, 1)`` holding the grayscale
+        crop divided by 255.0.
+    """
+    # Rows 3..21 and columns 120..176 (shifted by ``offset``); presumably the
+    # area where the prompt word is printed -- confirm against a sample image.
+    left = 120 + offset
+    right = 177 + offset
+    gray = cv2.cvtColor(img[3:22, left:right], cv2.COLOR_BGR2GRAY)
+    scaled = gray / 255.0
+    rows, cols = scaled.shape
+    # Add the leading batch axis and the trailing channel axis.
+    return scaled.reshape((1, rows, cols, 1))
+
+
+def get_coordinate(fn):
+    """Recognize a captcha image and return the coordinates of the tiles to click.
+
+    The prompt word at the top of the image is classified first (plus a second
+    word when one is present), then every tile is classified and the tiles
+    whose class equals one of the prompt words are selected.
+
+    :param fn: path of the captcha image file, read with ``cv2.imread``.
+    :return: comma-separated ``x,y`` pairs such as ``'31,45,170,115'``, or an
+        empty string when no tile matched or recognition failed (the failure
+        is reported through ``CommonLog.print_auto_code_fail``).
+    """
+    # Click point of each tile, indexed by tile position (row-major, 4 per row).
+    tile_centers = (
+        '31,45', '100,45', '170,45', '240,45',
+        '30,115', '100,115', '170,115', '240,115',
+    )
+
+    try:
+        # Read and preprocess the captcha
+        img = cv2.imread(fn)
+        text = get_text(img)
+        imgs = np.array(list(_get_imgs(img)))
+        imgs = preprocess_input(imgs)
+
+        # Recognize the first prompt word
+        model = models.load_model('py12306/helpers/ocr/model.v2.0.h5')
+        label = model.predict(text).argmax()
+        # One class name per line; the line index is the class index used by
+        # both models. The context manager closes the file deterministically.
+        with open('py12306/helpers/ocr/texts.txt', encoding='utf-8') as fp:
+            texts = [line.rstrip('\n') for line in fp]
+        titles = [texts[label]]
+
+        # Locate the next word: its horizontal position depends on the length
+        # of the first word.
+        first_len = len(titles[0])
+        if first_len == 1:
+            offset = 27
+        elif first_len == 2:
+            offset = 47
+        else:
+            offset = 60
+        text2 = get_text(img, offset=offset)
+        # Pixel values are scaled to [0, 1]; a mean close to 1.0 is an almost
+        # white crop, i.e. there is no second word.
+        if text2.mean() < 0.95:
+            titles.append(texts[model.predict(text2).argmax()])
+
+        # Load the tile classifier and label every tile
+        model = models.load_model('py12306/helpers/ocr/12306.image.model.h5')
+        labels = model.predict(imgs).argmax(axis=1)
+
+        # Keep the tiles whose class is one of the prompt words; positions
+        # without a known click point are ignored.
+        picked = [
+            tile_centers[pos]
+            for pos, tile_label in enumerate(labels)
+            if texts[tile_label] in titles and pos < len(tile_centers)
+        ]
+        return ','.join(picked)
+    except Exception:
+        # Best effort: report the failure and signal "no result" to the caller.
+        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
+        # SystemExit propagate.
+        CommonLog.print_auto_code_fail(CommonLog.MESSAGE_GET_RESPONSE_FROM_FREE_AUTO_CODE)
+        return ''
+
+
+def preprocess_input(x):
+    """Return a float32 copy of ``x`` with a fixed per-channel mean subtracted.
+
+    :param x: image array whose last axis holds the three colour channels.
+    :return: new ``float32`` array; the input array is left unmodified.
+    """
+    # The images are read with cv2, so the channels are already in BGR order.
+    # NOTE(review): these look like the usual ImageNet channel means -- confirm.
+    bgr_mean = [103.939, 116.779, 123.68]
+    shifted = x.astype('float32')
+    # Subtract in place on the float32 copy (broadcast over the last axis).
+    np.subtract(shifted, bgr_mean, out=shifted)
+    return shifted
+
+
+def _get_imgs(img):
+    """Yield the square tiles of a captcha image in row-major order.
+
+    Tiles are 67 pixels wide and tall and repeat every 72 pixels; the first
+    tile starts at row 40, column 5.
+
+    :param img: image array indexed as ``[row, column, ...]``.
+    :return: generator of ``img`` slices, one per tile.
+    """
+    gap, side = 5, 67
+    step = gap + side
+    for top in range(40, img.shape[0] - side, step):
+        bottom = top + side
+        for left in range(gap, img.shape[1] - side, step):
+            yield img[top:bottom, left:left + side]
+
+
+# Manual smoke test: run recognition on the file 'a.jpg' (path relative to the
+# current working directory) and print the resulting coordinate string.
+if __name__ == '__main__':
+    print(get_coordinate('a.jpg'))
--- a/py12306/helpers/ocr/model.v2.0.h5
+++ b/py12306/helpers/ocr/model.v2.0.h5
--- a/py12306/helpers/ocr/texts.txt
+++ b/py12306/helpers/ocr/texts.txt
@@ -0,0 +1,80 @@
+打字机
+调色板
+跑步机
+毛线
+老虎
+安全帽
+沙包
+盘子
+本子
+药片
+双面胶
+龙舟
+红酒
+拖把
+卷尺
+海苔
+红豆
+黑板
+热水袋
+烛台
+钟表
+路灯
+沙拉
+海报
+公交卡
+樱桃
+创可贴
+牌坊
+苍蝇拍
+高压锅
+电线
+网球拍
+海鸥
+风铃
+订书机
+冰箱
+话梅
+排风机
+锅铲
+绿豆
+航母
+电子秤
+红枣
+金字塔
+鞭炮
+菠萝
+开瓶器
+电饭煲
+仪表盘
+棉棒
+篮球
+狮子
+蚂蚁
+蜡烛
+茶盅
+印章
+茶几
+啤酒
+档案袋
+挂钟
+刺绣
+铃铛
+护腕
+手掌印
+锦旗
+文具盒
+辣椒酱
+耳塞
+中国结
+蜥蜴
+剪纸
+漏斗
+锣
+蒸笼
+珊瑚
+雨靴
+薯条
+蜜蜂
+日历
+口哨
--- a/py12306/log/common_log.py
+++ b/py12306/log/common_log.py
@@ -139,3 +139,10 @@ class CommonLog(BaseLog):
        self.add_quick_log('打码失败: 错误原因 {reason}'.format(reason=reason))
        self.flush()
        return self
+
+    @classmethod
+    def print_auth_code_info(cls, reason):
+        """Log an informational captcha-recognition message.
+
+        :param reason: text inserted into the log line.
+        :return: the logger instance created for this message.
+        """
+        logger = cls()
+        message = '打码信息: {reason}'.format(reason=reason)
+        logger.add_quick_log(message)
+        logger.flush()
+        return logger
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,4 +30,11 @@ w3lib==1.19.0
 websockets==7.0
 Werkzeug==0.14.1
 DingtalkChatbot==1.3.0
-lightpush==0.1.3
+lightpush==0.1.3
+sklearn==0.0
+opencv-python==4.1.0.25
+keras==2.2.4
+tensorflow==1.14.0
+matplotlib==3.1.0
+numpy==1.16.4
+scipy==1.3.0