# stock_cursor_v0/backend/sentiment_monitor.py

"""社区情绪监控 — 爬取分析东方财富/雪球热帖，量化散户情绪。

功能：
1. 爬取社区热帖
2. 情绪分析（乐观/悲观）
3. 热议股票排行
4. 关键词提取和词云
5. 情绪与股价相关性分析
"""
import datetime as dt
import json
import re
from typing import List, Dict, Any, Optional
from collections import Counter, defaultdict
import requests
from bs4 import BeautifulSoup
import jieba
import jieba.analyse
from sqlalchemy import select, func, and_, desc

from db import get_session
from models import SocialPost, SentimentIndex, DailyQuote, StockMetric

# Sentiment keyword lexicons: each keyword found in a post counts as one hit
# for its side (see analyze_sentiment).
BULLISH_KEYWORDS = [
    '看多', '看好', '买入', '加仓', '抄底', '突破',
    '上涨', '暴涨', '牛市', '利好', '反弹', '强势',
    '拉升', '涨停', '走强', '看涨', '做多',
]

BEARISH_KEYWORDS = [
    '看空', '看跌', '卖出', '减仓', '止损', '下跌',
    '暴跌', '熊市', '利空', '回调', '弱势', '下杀',
    '跌停', '走弱', '做空', '被套',
]

# Stop words that are filtered out of extracted keywords.
STOP_WORDS = {
    '的', '了', '是', '在', '我', '有', '和', '就', '不',
    '人', '都', '一', '一个', '上', '也', '很', '到', '说',
    '要', '去', '你', '会', '着', '没有', '看', '好', '自己',
    '这', '那', '以', '为', '而', '能', '他', '对', '于',
}


def crawl_eastmoney_hot(limit: int = 50) -> List[Dict[str, Any]]:
    """Return hot posts for the Eastmoney community (mock data only).

    No network request is made: the function fabricates ``limit`` posts.
    A real crawler would have to deal with anti-scraping measures, e.g.:
    1. proxy IPs
    2. browser-like request headers
    3. request rate limiting
    4. captcha handling

    Args:
        limit: Number of mock posts to generate.

    Returns:
        A list of post dicts with the keys ``source``, ``post_id``,
        ``title``, ``content``, ``author``, ``comment_count`` and
        ``view_count``.
    """
    # Placeholder data standing in for real crawl results.
    generated: List[Dict[str, Any]] = []
    for i in range(limit):
        post = {
            'source': 'eastmoney',
            'post_id': f'em_{i}',
            'title': f'模拟帖子{i}：今天大盘要反弹了',
            'content': '技术分析显示底部信号明显，建议逢低买入',
            'author': f'用户{i}',
            'comment_count': 100 + i * 10,
            'view_count': 1000 + i * 100,
        }
        generated.append(post)
    return generated


def crawl_xueqiu_hot(limit: int = 50) -> List[Dict[str, Any]]:
    """Return hot posts for the Xueqiu community (mock data only).

    The real Xueqiu API needs a cookie and token obtained after login, so
    this simplified version fabricates ``limit`` posts instead.

    Args:
        limit: Number of mock posts to generate.

    Returns:
        A list of post dicts with the keys ``source``, ``post_id``,
        ``title``, ``content``, ``author``, ``comment_count`` and
        ``view_count``.
    """
    generated: List[Dict[str, Any]] = []
    for i in range(limit):
        post = {
            'source': 'xueqiu',
            'post_id': f'xq_{i}',
            'title': f'雪球热议{i}：半导体板块分析',
            'content': '从产业链角度看，半导体景气度回升',
            'author': f'雪球用户{i}',
            'comment_count': 50 + i * 5,
            'view_count': 500 + i * 50,
        }
        generated.append(post)
    return generated


def analyze_sentiment(text: str) -> str:
    """Classify a text as bullish, bearish or neutral by keyword hits.

    Each keyword of a lexicon that occurs in the text counts once. A side
    wins only if it has strictly more hits than the other side and at
    least two hits; everything else is neutral.

    Args:
        text: Text to classify.

    Returns:
        One of ``'bullish'``, ``'bearish'`` or ``'neutral'``.
    """
    lowered = text.lower()

    bull_hits = len([kw for kw in BULLISH_KEYWORDS if kw in lowered])
    bear_hits = len([kw for kw in BEARISH_KEYWORDS if kw in lowered])

    if bull_hits >= 2 and bull_hits > bear_hits:
        return 'bullish'
    if bear_hits >= 2 and bear_hits > bull_hits:
        return 'bearish'
    return 'neutral'


def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """Extract keywords from a text with jieba.

    Args:
        text: Text to analyse.
        top_n: Number of keywords requested from jieba; the result may be
            shorter because filtering happens after extraction.

    Returns:
        Keywords in the order jieba returned them, without stop words and
        without single-character terms.
    """
    candidates = jieba.analyse.extract_tags(text, topK=top_n, withWeight=False)

    # Drop stop words and single-character terms.
    kept = []
    for word in candidates:
        if len(word) <= 1 or word in STOP_WORDS:
            continue
        kept.append(word)

    return kept[:top_n]


def extract_stock_codes(text: str) -> List[str]:
    """Extract six-digit stock codes from free text.

    A code is a run of exactly six digits that starts with 0, 3 or 6 and
    is not part of a longer digit run. Codes directly adjacent to letters
    or Chinese characters (e.g. ``买入600519吧``, ``SH600519``) are
    recognised.

    Args:
        text: Text to scan; an empty or ``None`` value yields an empty list.

    Returns:
        Unique codes in order of first appearance.
    """
    if not text:
        return []

    # Digit lookarounds instead of \b: in Python 3 str patterns CJK
    # characters are word characters, so \b never matches between Chinese
    # text and a digit and such codes would be missed.
    pattern = r'(?<!\d)[036]\d{5}(?!\d)'
    codes = re.findall(pattern, text)

    # dict.fromkeys de-duplicates while keeping first-seen order, so callers
    # that take the first code get a deterministic result.
    return list(dict.fromkeys(codes))


def collect_posts(limit_per_source: int = 50) -> Dict[str, Any]:
    """Collect community posts, analyse them and store the new ones.

    Posts whose ``post_id`` already exists in the database are skipped.
    For every new post the sentiment label, up to five keywords and the
    first stock code found in title + content are stored with it.

    Args:
        limit_per_source: Number of posts requested from each source.

    Returns:
        ``{'ok': True, 'collected': <posts fetched>, 'saved': <posts inserted>}``.
    """
    gathered = []

    # Each source has its own try block so that a failing crawler is only
    # reported and does not prevent the other source from being collected.
    try:
        gathered.extend(crawl_eastmoney_hot(limit_per_source))
    except Exception as e:
        print(f"[eastmoney] crawl error: {e}")

    try:
        gathered.extend(crawl_xueqiu_hot(limit_per_source))
    except Exception as e:
        print(f"[xueqiu] crawl error: {e}")

    inserted = 0
    with get_session() as s:
        for item in gathered:
            # Skip posts that were stored by an earlier run.
            lookup = select(SocialPost).where(SocialPost.post_id == item['post_id'])
            if s.execute(lookup).scalar_one_or_none():
                continue

            title = item['title']
            body = item.get('content', '')
            text = title + ' ' + body

            sentiment = analyze_sentiment(text)
            keywords = extract_keywords(text, top_n=5)

            # Only the first extracted code is stored; '' means none found.
            found_codes = extract_stock_codes(text)
            code = found_codes[0] if found_codes else ''

            s.add(SocialPost(
                source=item['source'],
                post_id=item['post_id'],
                code=code,
                title=title,
                content=body,
                author=item.get('author', ''),
                comment_count=item.get('comment_count', 0),
                view_count=item.get('view_count', 0),
                sentiment=sentiment,
                keywords=','.join(keywords)
            ))
            inserted += 1

        s.commit()

    return {
        'ok': True,
        'collected': len(gathered),
        'saved': inserted
    }


def calculate_sentiment_index(date: Optional[dt.date] = None) -> Dict[str, Any]:
    """Compute and persist the sentiment index of one day.

    Counts the posts created on the given day per sentiment label,
    derives the bullish percentage and the 20 most frequent keywords,
    and inserts or updates the ``SentimentIndex`` row of that day.

    Args:
        date: Day to evaluate; ``None`` means today.

    Returns:
        ``{'ok': False, 'msg': ...}`` if the day has no posts, otherwise a
        dict with the counts, ``bullish_ratio`` (rounded to 2 places),
        ``total_posts`` and ``top_keywords``.
    """
    day = date if date is not None else dt.date.today()

    # The day is covered as the closed interval [00:00:00, 23:59:59.999999].
    window_start = dt.datetime.combine(day, dt.time.min)
    window_end = dt.datetime.combine(day, dt.time.max)

    with get_session() as s:
        in_window = and_(
            SocialPost.created_at >= window_start,
            SocialPost.created_at <= window_end
        )
        posts = s.execute(select(SocialPost).where(in_window)).scalars().all()

        if not posts:
            return {'ok': False, 'msg': '暂无数据'}

        tally = Counter(p.sentiment for p in posts)
        bullish_count = tally['bullish']
        bearish_count = tally['bearish']
        neutral_count = tally['neutral']
        total = len(posts)

        # posts is non-empty here, so total is always positive.
        bullish_ratio = bullish_count / total * 100

        # Keywords are stored comma-separated on each post.
        keyword_counter = Counter(
            word
            for p in posts
            if p.keywords
            for word in p.keywords.split(',')
        )
        top_keywords = []
        for word, count in keyword_counter.most_common(20):
            top_keywords.append({'word': word, 'count': count})

        # Values shared by the insert and the update path.
        fields = {
            'bullish_count': bullish_count,
            'bearish_count': bearish_count,
            'neutral_count': neutral_count,
            'bullish_ratio': bullish_ratio,
            'total_posts': total,
            'top_keywords': json.dumps(top_keywords, ensure_ascii=False),
        }

        existing = s.execute(
            select(SentimentIndex).where(SentimentIndex.date == day)
        ).scalar_one_or_none()

        if existing:
            for attr, value in fields.items():
                setattr(existing, attr, value)
            existing.updated_at = dt.datetime.now()
        else:
            s.add(SentimentIndex(date=day, **fields))

        s.commit()

    return {
        'ok': True,
        'date': day.isoformat(),
        'bullish_count': bullish_count,
        'bearish_count': bearish_count,
        'neutral_count': neutral_count,
        'bullish_ratio': round(bullish_ratio, 2),
        'total_posts': total,
        'top_keywords': top_keywords
    }


def get_hot_stocks(days: int = 1, limit: int = 20) -> Dict[str, Any]:
    """Rank the most discussed stocks of the recent period.

    The database returns the ``limit`` codes with the most posts; those
    rows are then re-ordered by ``heat_score`` (post count * 10 + total
    comments) and enriched with name, close and pct from ``StockMetric``.

    Args:
        days: Look-back window in days, counted from now.
        limit: Maximum number of stocks returned.

    Returns:
        ``{'ok': False, 'msg': ...}`` if no post carries a stock code in
        the window, otherwise ``{'ok': True, 'days': days, 'stocks': [...]}``.
    """
    cutoff = dt.datetime.now() - dt.timedelta(days=days)

    # Aggregate posts per stock code, ignoring posts without a code.
    ranking_query = (
        select(
            SocialPost.code,
            func.count().label('post_count'),
            func.sum(SocialPost.comment_count).label('total_comments'),
            func.sum(SocialPost.view_count).label('total_views')
        )
        .where(
            and_(
                SocialPost.code != '',
                SocialPost.created_at >= cutoff
            )
        )
        .group_by(SocialPost.code)
        .order_by(desc('post_count'))
        .limit(limit)
    )

    with get_session() as s:
        rows = s.execute(ranking_query).all()

        if not rows:
            return {'ok': False, 'msg': '暂无数据'}

        # Look up name and latest price figures for the ranked codes.
        ranked_codes = [row.code for row in rows]
        metric_rows = s.execute(
            select(StockMetric)
            .where(StockMetric.code.in_(ranked_codes))
        ).scalars()
        metrics = {
            m.code: {'name': m.name, 'close': m.close, 'pct': m.pct}
            for m in metric_rows
        }

        unsorted_results = []
        for row in rows:
            if row.code in metrics:
                info = metrics[row.code]
            else:
                # No metric row: fall back to the code as name and zeros.
                info = {'name': row.code, 'close': 0, 'pct': 0}

            # SUM() yields NULL/None when every summed value is NULL.
            comments = row.total_comments or 0
            views = row.total_views or 0

            unsorted_results.append({
                'code': row.code,
                'name': info['name'],
                'post_count': row.post_count,
                'total_comments': comments,
                'total_views': views,
                'heat_score': row.post_count * 10 + comments,
                'close': info['close'],
                'pct': info['pct']
            })

        # Final order is by heat score, highest first.
        results = sorted(
            unsorted_results,
            key=lambda entry: entry['heat_score'],
            reverse=True
        )

    return {
        'ok': True,
        'days': days,
        'stocks': results
    }


def get_sentiment_history(days: int = 30) -> Dict[str, Any]:
    """Return the stored sentiment index as parallel time series.

    Args:
        days: Look-back window in days, counted from today.

    Returns:
        ``{'ok': False, 'msg': ...}`` if no index rows exist in the window,
        otherwise a dict with the date-ordered lists ``dates`` (ISO
        strings), ``bullish_ratio`` (rounded to 2 places) and
        ``total_posts``.
    """
    earliest = dt.date.today() - dt.timedelta(days=days)
    history_query = (
        select(SentimentIndex)
        .where(SentimentIndex.date >= earliest)
        .order_by(SentimentIndex.date)
    )

    with get_session() as s:
        rows = s.execute(history_query).scalars().all()

    if not rows:
        return {'ok': False, 'msg': '暂无历史数据'}

    dates = []
    ratios = []
    totals = []
    for row in rows:
        dates.append(row.date.isoformat())
        ratios.append(round(row.bullish_ratio, 2))
        totals.append(row.total_posts)

    return {
        'ok': True,
        'dates': dates,
        'bullish_ratio': ratios,
        'total_posts': totals
    }


def analyze_sentiment_correlation(code: str, days: int = 60) -> Dict[str, Any]:
    """Correlate a stock's daily bullish ratio with its daily price move.

    Posts for ``code`` are grouped by calendar day into a bullish
    percentage. That series is compared with the open-to-close percentage
    change of the same days using the Pearson coefficient.

    Args:
        code: Stock code to analyse.
        days: Look-back window in days, counted from today.

    Returns:
        ``{'ok': False, 'msg': ...}`` when there are no posts, no quotes,
        or fewer than 10 days that have both. Otherwise a dict with
        ``correlation`` (rounded to 3 places), ``interpretation`` and the
        aligned ``dates`` / ``sentiment_scores`` / ``price_changes`` lists.
        If either series is constant the coefficient is mathematically
        undefined and is reported as ``0.0`` instead of NaN.
    """
    since = dt.date.today() - dt.timedelta(days=days)

    with get_session() as s:
        # Posts that mention this stock inside the window.
        posts = s.execute(
            select(SocialPost)
            .where(
                and_(
                    SocialPost.code == code,
                    func.date(SocialPost.created_at) >= since
                )
            )
        ).scalars().all()

        if not posts:
            return {'ok': False, 'msg': '该股票暂无社区数据'}

        # Per-day counters of each sentiment label plus the day's total.
        daily_sentiment = defaultdict(lambda: {'bullish': 0, 'bearish': 0, 'neutral': 0, 'total': 0})
        for p in posts:
            date = p.created_at.date()
            daily_sentiment[date][p.sentiment] += 1
            daily_sentiment[date]['total'] += 1

        # Daily quotes; pct is the intraday move from open to close.
        prices = {}
        for q in s.execute(
            select(DailyQuote)
            .where(
                and_(
                    DailyQuote.code == code,
                    DailyQuote.date >= since
                )
            )
            .order_by(DailyQuote.date)
        ).scalars():
            prices[q.date] = {
                'close': float(q.close),
                'pct': ((float(q.close) - float(q.open)) / float(q.open) * 100) if q.open > 0 else 0
            }

    if not prices:
        return {'ok': False, 'msg': '缺少股价数据'}

    # Only days that have both posts and a quote can be compared.
    dates = sorted(set(daily_sentiment.keys()) & set(prices.keys()))

    if len(dates) < 10:
        return {'ok': False, 'msg': '数据点不足'}

    sentiment_scores = []
    price_changes = []

    for date in dates:
        s_data = daily_sentiment[date]
        bullish_ratio = s_data['bullish'] / s_data['total'] * 100 if s_data['total'] > 0 else 50
        sentiment_scores.append(bullish_ratio)

        price_changes.append(prices[date]['pct'])

    import numpy as np

    # Pearson correlation divides by both standard deviations, so a
    # constant series makes np.corrcoef return NaN (with a RuntimeWarning).
    # NaN is not valid JSON and would be misread as a strong negative
    # correlation, so an undefined coefficient is reported as 0.0.
    sentiment_is_flat = min(sentiment_scores) == max(sentiment_scores)
    price_is_flat = min(price_changes) == max(price_changes)
    if sentiment_is_flat or price_is_flat:
        correlation = 0.0
    else:
        correlation = float(np.corrcoef(sentiment_scores, price_changes)[0, 1])
        if np.isnan(correlation):
            correlation = 0.0

    return {
        'ok': True,
        'code': code,
        'days': days,
        'data_points': len(dates),
        'correlation': round(correlation, 3),
        'interpretation': _interpret_correlation(correlation),
        'dates': [d.isoformat() for d in dates],
        'sentiment_scores': [round(score, 2) for score in sentiment_scores],
        'price_changes': [round(change, 2) for change in price_changes]
    }


def _interpret_correlation(corr: float) -> str:
    """Translate a correlation coefficient into a human-readable verdict.

    Args:
        corr: Correlation coefficient; may be NaN when it is undefined.

    Returns:
        One of five fixed descriptions, from strong positive to strong
        negative. NaN is reported as a weak correlation.
    """
    import math

    # NaN fails every ">" comparison and would otherwise fall through to
    # the strong-negative branch; an undefined coefficient carries no
    # evidence of any relationship.
    if math.isnan(corr):
        return '弱相关：情绪与股价关系不明显'

    if corr > 0.7:
        return '强正相关：情绪高涨时股价往往上涨'
    elif corr > 0.3:
        return '中度正相关：情绪与股价有一定同步性'
    elif corr > -0.3:
        return '弱相关：情绪与股价关系不明显'
    elif corr > -0.7:
        return '中度负相关：情绪高涨时股价反而下跌（反向指标）'
    else:
        return '强负相关：典型反向指标，情绪越乐观越要警惕'


def get_keyword_cloud(days: int = 7, top_n: int = 50) -> Dict[str, Any]:
    """Build word-cloud data from the keywords of recent posts.

    Args:
        days: Look-back window in days, counted from now.
        top_n: Number of most frequent keywords to return.

    Returns:
        ``{'ok': False, 'msg': ...}`` if the window contains no posts,
        otherwise ``{'ok': True, 'days': days, 'keywords': [...]}`` where
        each entry is ``{'name': <keyword>, 'value': <frequency>}``.
    """
    cutoff = dt.datetime.now() - dt.timedelta(days=days)

    with get_session() as s:
        recent_posts = s.execute(
            select(SocialPost)
            .where(SocialPost.created_at >= cutoff)
        ).scalars().all()

        if not recent_posts:
            return {'ok': False, 'msg': '暂无数据'}

        # Keywords are stored comma-separated on each post; count them all.
        frequencies = Counter()
        for post in recent_posts:
            if post.keywords:
                frequencies.update(post.keywords.split(','))

        # Shape expected by the word-cloud consumer: name/value pairs.
        wordcloud_data = []
        for word, count in frequencies.most_common(top_n):
            wordcloud_data.append({'name': word, 'value': count})

    return {
        'ok': True,
        'days': days,
        'keywords': wordcloud_data
    }