1. 数据采集 (Data Acquisition)
通过请求后端接口获取 Base64 格式的验证码图片并保存到本地。
import requests
import base64
import os
import uuid
def download_captcha_images(save_dir="dataset/raw", count=100):
    """Download `count` base64-encoded captcha images from the backend API.

    Each image is decoded and saved under `save_dir` as ``<uuid>.jpg`` (the
    server-supplied uuid when present, otherwise a locally generated one).
    After manual labeling, files are expected to be renamed ``label_<uuid>.jpg``.

    Failures (network errors, malformed responses) are logged and skipped so
    one bad request does not abort the whole run.
    """
    url = "https://mall.banmuhuatian.com:18090/prod-api/captchaImage"
    headers = {"User-Agent": "Mozilla/5.0 ..."}
    os.makedirs(save_dir, exist_ok=True)
    for i in range(count):
        try:
            # timeout prevents a single stalled request from hanging the loop
            res = requests.get(url, headers=headers, timeout=10).json()
            if res.get("code") == 200:
                img_data = base64.b64decode(res["img"])
                # 推荐文件名格式:uuid.jpg,后续标注后改为 label_uuid.jpg
                filename = f"{res.get('uuid', uuid.uuid4().hex)}.jpg"
                with open(os.path.join(save_dir, filename), "wb") as f:
                    f.write(img_data)
                print(f"[{i+1}] 已下载: {filename}")
        except Exception as e:
            print(f"错误: {e}")
2. 自动化标注 (Auto-Labeling)
利用 Qwen-VL-Plus 的多模态能力,将图片中的数学表达式(如 3+5=?)转为文本,作为训练集的标签。
def encode_image_to_base64(image_path):
    """Read the file at *image_path* and return its Base64 text (UTF-8)."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
def get_label_from_ai(image_path):
    """Label one captcha image by asking Qwen-VL-Plus for its math expression.

    Sends the image (as a base64 data URL) to the DashScope OpenAI-compatible
    endpoint and returns the model's stripped text answer, e.g. ``"3+5=?"``.

    Raises:
        RuntimeError: if the DASHSCOPE_API_KEY environment variable is unset.
        requests.HTTPError: if the API responds with a non-2xx status.
    """
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        # Fail fast with a clear message instead of sending "Bearer None"
        # and getting an opaque KeyError when parsing the error response.
        raise RuntimeError("DASHSCOPE_API_KEY environment variable is not set")
    url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
    payload = {
        "model": "qwen-vl-plus",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "只输出图片中的数学表达式,如 1+2=?,不要任何解释。"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path)}"}}
            ]
        }]
    }
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    # Generous timeout: vision-model completions can take tens of seconds.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"].strip()
3. CNN 模型构建 (Model Architecture)
采用经典的卷积神经网络。由于验证码是固定长度的字符流,输出层通常采用 Captcha_Size * Character_Set_Size 的扁平化设计。
import torch.nn as nn
class CaptchaNet(nn.Module):
    """CNN for fixed-length captcha recognition.

    The head is a flat vector of size ``num_chars * char_set_len``: one
    ``char_set_len``-way classification per character position, concatenated.
    Expects 3-channel input of size 160x60 (W x H), i.e. tensors shaped
    ``(N, 3, 60, 160)``.
    """

    def __init__(self, num_chars, char_set_len):
        super(CaptchaNet, self).__init__()

        def conv_block(c_in, c_out):
            # Conv -> BatchNorm -> ReLU -> 2x2 max-pool (halves H and W).
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2),
            )

        # Feature extractor: three conv stages, channels 3 -> 16 -> 64 -> 128.
        self.layer1 = conv_block(3, 16)
        self.layer2 = conv_block(16, 64)
        self.layer3 = conv_block(64, 128)
        # With a 160x60 input, three poolings leave 20x7 feature maps.
        self.fc = nn.Sequential(
            nn.Linear(128 * 20 * 7, 1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, num_chars * char_set_len),
        )

    def forward(self, x):
        feats = self.layer3(self.layer2(self.layer1(x)))
        return self.fc(feats.flatten(1))
4. 模型部署 (ONNX + Flask)
训练完成后,将 PyTorch 模型转换为 ONNX 格式,以获得更好的推理性能,并使用 Flask 提供 API。
模型导出
import torch
def export_to_onnx(model, save_path="captcha.onnx", input_size=(60, 160)):
    """Export a trained captcha model to ONNX format.

    Args:
        model: the PyTorch module to export (should already be in eval mode).
        save_path: destination path for the .onnx file.
        input_size: (height, width) of the expected input image; defaults to
            the 60x160 captcha size used throughout this pipeline.

    The batch dimension is declared dynamic so the deployed model can score
    batches of any size, not only batch=1.
    """
    height, width = input_size
    dummy_input = torch.randn(1, 3, height, width)
    torch.onnx.export(
        model, dummy_input, save_path,
        input_names=['input'], output_names=['output'],
        dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}},
    )
Flask 服务端
from flask import Flask, request, jsonify
import onnxruntime as ort
import numpy as np
app = Flask(__name__)
# Load the ONNX model once at startup; the session is reused for every request.
session = ort.InferenceSession("captcha.onnx")


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the captcha text for an uploaded image.

    Expects a multipart form upload under the key ``image`` and returns JSON
    ``{"result": <decoded text>}``. Responds with HTTP 400 when the field is
    missing instead of crashing with a KeyError.
    """
    file = request.files.get('image')
    if file is None:
        return jsonify({"error": "missing 'image' file field"}), 400
    # 1. Image preprocessing (resize, normalize, NCHW float32).
    #    NOTE(review): the original code referenced an undefined
    #    `processed_img`; `preprocess_image` is expected to be provided
    #    elsewhere in the project, alongside `decode_prediction`.
    processed_img = preprocess_image(file)
    # 2. Inference
    ort_inputs = {session.get_inputs()[0].name: processed_img}
    preds = session.run(None, ort_inputs)[0]
    # 3. Decode the raw scores back into the character string
    result = decode_prediction(preds)
    return jsonify({"result": result})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)