claude强化功能

2026-06-14 11:54:45 +08:00
parent cc8dff4e57
commit e524a3589a
43 changed files with 13421 additions and 73 deletions
--- a/backend/sentiment_monitor.py
+++ b/backend/sentiment_monitor.py
@@ -0,0 +1,556 @@
+"""社区情绪监控 — 爬取分析东方财富/雪球热帖，量化散户情绪。
+
+功能：
+1. 爬取社区热帖
+2. 情绪分析（乐观/悲观）
+3. 热议股票排行
+4. 关键词提取和词云
+5. 情绪与股价相关性分析
+"""
+import datetime as dt
+import json
+import re
+from typing import List, Dict, Any, Optional
+from collections import Counter, defaultdict
+import requests
+from bs4 import BeautifulSoup
+import jieba
+import jieba.analyse
+from sqlalchemy import select, func, and_, desc
+
+from db import get_session
+from models import SocialPost, SentimentIndex, DailyQuote, StockMetric
+
+# 情绪关键词库
+BULLISH_KEYWORDS = [
+    '看多', '看好', '买入', '加仓', '抄底', '突破', '上涨', '暴涨', '牛市',
+    '利好', '反弹', '强势', '拉升', '涨停', '走强', '看涨', '做多'
+]
+
+BEARISH_KEYWORDS = [
+    '看空', '看跌', '卖出', '减仓', '止损', '下跌', '暴跌', '熊市',
+    '利空', '回调', '弱势', '下杀', '跌停', '走弱', '做空', '被套'
+]
+
+# 停用词
+STOP_WORDS = set([
+    '的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一',
+    '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
+    '看', '好', '自己', '这', '那', '以', '为', '而', '能', '他', '对', '于'
+])
+
+
+def crawl_eastmoney_hot(limit: int = 50) -> List[Dict[str, Any]]:
+    """爬取东方财富热帖（简化版，实际需要处理反爬）
+    
+    注意：由于反爬限制，这里返回模拟数据
+    实际生产环境需要：
+    1. 使用代理IP
+    2. 模拟浏览器headers
+    3. 控制请求频率
+    4. 处理验证码
+    """
+    # 模拟数据（实际应该爬取真实数据）
+    mock_posts = [
+        {
+            'source': 'eastmoney',
+            'post_id': f'em_{i}',
+            'title': f'模拟帖子{i}：今天大盘要反弹了',
+            'content': '技术分析显示底部信号明显，建议逢低买入',
+            'author': f'用户{i}',
+            'comment_count': 100 + i * 10,
+            'view_count': 1000 + i * 100,
+        }
+        for i in range(limit)
+    ]
+    
+    return mock_posts
+
+
+def crawl_xueqiu_hot(limit: int = 50) -> List[Dict[str, Any]]:
+    """爬取雪球热帖（简化版）"""
+    # 雪球API（需要cookie和token）
+    # 实际使用需要登录后获取token
+    
+    mock_posts = [
+        {
+            'source': 'xueqiu',
+            'post_id': f'xq_{i}',
+            'title': f'雪球热议{i}：半导体板块分析',
+            'content': '从产业链角度看，半导体景气度回升',
+            'author': f'雪球用户{i}',
+            'comment_count': 50 + i * 5,
+            'view_count': 500 + i * 50,
+        }
+        for i in range(limit)
+    ]
+    
+    return mock_posts
+
+
+def analyze_sentiment(text: str) -> str:
+    """分析文本情绪
+    
+    Args:
+        text: 待分析文本
+    
+    Returns:
+        情绪标签：bullish/bearish/neutral
+    """
+    text_lower = text.lower()
+    
+    bullish_score = sum(1 for kw in BULLISH_KEYWORDS if kw in text_lower)
+    bearish_score = sum(1 for kw in BEARISH_KEYWORDS if kw in text_lower)
+    
+    if bullish_score > bearish_score and bullish_score >= 2:
+        return 'bullish'
+    elif bearish_score > bullish_score and bearish_score >= 2:
+        return 'bearish'
+    else:
+        return 'neutral'
+
+
+def extract_keywords(text: str, top_n: int = 10) -> List[str]:
+    """提取关键词
+    
+    Args:
+        text: 文本内容
+        top_n: 返回前N个关键词
+    
+    Returns:
+        关键词列表
+    """
+    # 使用jieba提取关键词
+    keywords = jieba.analyse.extract_tags(text, topK=top_n, withWeight=False)
+    
+    # 过滤停用词
+    keywords = [kw for kw in keywords if kw not in STOP_WORDS and len(kw) > 1]
+    
+    return keywords[:top_n]
+
+
+def extract_stock_codes(text: str) -> List[str]:
+    """从文本中提取股票代码
+    
+    Args:
+        text: 文本内容
+    
+    Returns:
+        股票代码列表
+    """
+    # 匹配6位数字的股票代码
+    pattern = r'\b[036]\d{5}\b'
+    codes = re.findall(pattern, text)
+    return list(set(codes))
+
+
+def collect_posts(limit_per_source: int = 50) -> Dict[str, Any]:
+    """采集社区帖子
+    
+    Args:
+        limit_per_source: 每个来源采集数量
+    
+    Returns:
+        采集结果
+    """
+    all_posts = []
+    
+    # 采集东方财富
+    try:
+        em_posts = crawl_eastmoney_hot(limit_per_source)
+        all_posts.extend(em_posts)
+    except Exception as e:
+        print(f"[eastmoney] crawl error: {e}")
+    
+    # 采集雪球
+    try:
+        xq_posts = crawl_xueqiu_hot(limit_per_source)
+        all_posts.extend(xq_posts)
+    except Exception as e:
+        print(f"[xueqiu] crawl error: {e}")
+    
+    # 分析并存储
+    saved_count = 0
+    with get_session() as s:
+        for post in all_posts:
+            # 检查是否已存在
+            exists = s.execute(
+                select(SocialPost).where(SocialPost.post_id == post['post_id'])
+            ).scalar_one_or_none()
+            
+            if exists:
+                continue
+            
+            # 情绪分析
+            text = post['title'] + ' ' + post.get('content', '')
+            sentiment = analyze_sentiment(text)
+            
+            # 提取关键词
+            keywords = extract_keywords(text, top_n=5)
+            
+            # 提取股票代码
+            codes = extract_stock_codes(text)
+            code = codes[0] if codes else ''
+            
+            # 存储
+            record = SocialPost(
+                source=post['source'],
+                post_id=post['post_id'],
+                code=code,
+                title=post['title'],
+                content=post.get('content', ''),
+                author=post.get('author', ''),
+                comment_count=post.get('comment_count', 0),
+                view_count=post.get('view_count', 0),
+                sentiment=sentiment,
+                keywords=','.join(keywords)
+            )
+            s.add(record)
+            saved_count += 1
+        
+        s.commit()
+    
+    return {
+        'ok': True,
+        'collected': len(all_posts),
+        'saved': saved_count
+    }
+
+
+def calculate_sentiment_index(date: Optional[dt.date] = None) -> Dict[str, Any]:
+    """计算情绪指数
+    
+    Args:
+        date: 统计日期，None表示今天
+    
+    Returns:
+        情绪指数数据
+    """
+    if date is None:
+        date = dt.date.today()
+    
+    start = dt.datetime.combine(date, dt.time.min)
+    end = dt.datetime.combine(date, dt.time.max)
+    
+    with get_session() as s:
+        # 统计各情绪数量
+        posts = s.execute(
+            select(SocialPost)
+            .where(
+                and_(
+                    SocialPost.created_at >= start,
+                    SocialPost.created_at <= end
+                )
+            )
+        ).scalars().all()
+        
+        if not posts:
+            return {'ok': False, 'msg': '暂无数据'}
+        
+        bullish_count = sum(1 for p in posts if p.sentiment == 'bullish')
+        bearish_count = sum(1 for p in posts if p.sentiment == 'bearish')
+        neutral_count = sum(1 for p in posts if p.sentiment == 'neutral')
+        total = len(posts)
+        
+        bullish_ratio = bullish_count / total * 100 if total > 0 else 0
+        
+        # 提取热门关键词
+        all_keywords = []
+        for p in posts:
+            if p.keywords:
+                all_keywords.extend(p.keywords.split(','))
+        
+        keyword_counter = Counter(all_keywords)
+        top_keywords = [
+            {'word': kw, 'count': cnt}
+            for kw, cnt in keyword_counter.most_common(20)
+        ]
+        
+        # 存储情绪指数
+        index_record = s.execute(
+            select(SentimentIndex).where(SentimentIndex.date == date)
+        ).scalar_one_or_none()
+        
+        if index_record:
+            index_record.bullish_count = bullish_count
+            index_record.bearish_count = bearish_count
+            index_record.neutral_count = neutral_count
+            index_record.bullish_ratio = bullish_ratio
+            index_record.total_posts = total
+            index_record.top_keywords = json.dumps(top_keywords, ensure_ascii=False)
+            index_record.updated_at = dt.datetime.now()
+        else:
+            index_record = SentimentIndex(
+                date=date,
+                bullish_count=bullish_count,
+                bearish_count=bearish_count,
+                neutral_count=neutral_count,
+                bullish_ratio=bullish_ratio,
+                total_posts=total,
+                top_keywords=json.dumps(top_keywords, ensure_ascii=False)
+            )
+            s.add(index_record)
+        
+        s.commit()
+    
+    return {
+        'ok': True,
+        'date': date.isoformat(),
+        'bullish_count': bullish_count,
+        'bearish_count': bearish_count,
+        'neutral_count': neutral_count,
+        'bullish_ratio': round(bullish_ratio, 2),
+        'total_posts': total,
+        'top_keywords': top_keywords
+    }
+
+
+def get_hot_stocks(days: int = 1, limit: int = 20) -> Dict[str, Any]:
+    """获取热议股票排行
+    
+    Args:
+        days: 统计天数
+        limit: 返回数量
+    
+    Returns:
+        热议股票列表
+    """
+    since = dt.datetime.now() - dt.timedelta(days=days)
+    
+    with get_session() as s:
+        # 按股票代码分组统计
+        stmt = (
+            select(
+                SocialPost.code,
+                func.count().label('post_count'),
+                func.sum(SocialPost.comment_count).label('total_comments'),
+                func.sum(SocialPost.view_count).label('total_views')
+            )
+            .where(
+                and_(
+                    SocialPost.code != '',
+                    SocialPost.created_at >= since
+                )
+            )
+            .group_by(SocialPost.code)
+            .order_by(desc('post_count'))
+            .limit(limit)
+        )
+        
+        rows = s.execute(stmt).all()
+        
+        if not rows:
+            return {'ok': False, 'msg': '暂无数据'}
+        
+        # 获取股票名称和最新价格
+        codes = [r.code for r in rows]
+        metrics = {}
+        for m in s.execute(
+            select(StockMetric)
+            .where(StockMetric.code.in_(codes))
+        ).scalars():
+            metrics[m.code] = {
+                'name': m.name,
+                'close': m.close,
+                'pct': m.pct
+            }
+        
+        results = []
+        for r in rows:
+            info = metrics.get(r.code, {'name': r.code, 'close': 0, 'pct': 0})
+            results.append({
+                'code': r.code,
+                'name': info['name'],
+                'post_count': r.post_count,
+                'total_comments': r.total_comments or 0,
+                'total_views': r.total_views or 0,
+                'heat_score': r.post_count * 10 + (r.total_comments or 0),
+                'close': info['close'],
+                'pct': info['pct']
+            })
+        
+        # 按热度评分排序
+        results.sort(key=lambda x: x['heat_score'], reverse=True)
+    
+    return {
+        'ok': True,
+        'days': days,
+        'stocks': results
+    }
+
+
+def get_sentiment_history(days: int = 30) -> Dict[str, Any]:
+    """获取情绪指数历史
+    
+    Args:
+        days: 统计天数
+    
+    Returns:
+        历史数据
+    """
+    since = dt.date.today() - dt.timedelta(days=days)
+    
+    with get_session() as s:
+        rows = s.execute(
+            select(SentimentIndex)
+            .where(SentimentIndex.date >= since)
+            .order_by(SentimentIndex.date)
+        ).scalars().all()
+    
+    if not rows:
+        return {'ok': False, 'msg': '暂无历史数据'}
+    
+    return {
+        'ok': True,
+        'dates': [r.date.isoformat() for r in rows],
+        'bullish_ratio': [round(r.bullish_ratio, 2) for r in rows],
+        'total_posts': [r.total_posts for r in rows]
+    }
+
+
+def analyze_sentiment_correlation(code: str, days: int = 60) -> Dict[str, Any]:
+    """分析情绪与股价相关性
+    
+    Args:
+        code: 股票代码
+        days: 分析天数
+    
+    Returns:
+        相关性分析结果
+    """
+    since = dt.date.today() - dt.timedelta(days=days)
+    
+    with get_session() as s:
+        # 获取该股票的讨论量和情绪
+        posts = s.execute(
+            select(SocialPost)
+            .where(
+                and_(
+                    SocialPost.code == code,
+                    func.date(SocialPost.created_at) >= since
+                )
+            )
+        ).scalars().all()
+        
+        if not posts:
+            return {'ok': False, 'msg': '该股票暂无社区数据'}
+        
+        # 按日期聚合
+        daily_sentiment = defaultdict(lambda: {'bullish': 0, 'bearish': 0, 'neutral': 0, 'total': 0})
+        for p in posts:
+            date = p.created_at.date()
+            daily_sentiment[date][p.sentiment] += 1
+            daily_sentiment[date]['total'] += 1
+        
+        # 获取股价数据
+        prices = {}
+        for q in s.execute(
+            select(DailyQuote)
+            .where(
+                and_(
+                    DailyQuote.code == code,
+                    DailyQuote.date >= since
+                )
+            )
+            .order_by(DailyQuote.date)
+        ).scalars():
+            prices[q.date] = {
+                'close': float(q.close),
+                'pct': ((float(q.close) - float(q.open)) / float(q.open) * 100) if q.open > 0 else 0
+            }
+    
+    if not prices:
+        return {'ok': False, 'msg': '缺少股价数据'}
+    
+    # 计算相关性（简化版）
+    dates = sorted(set(daily_sentiment.keys()) & set(prices.keys()))
+    
+    if len(dates) < 10:
+        return {'ok': False, 'msg': '数据点不足'}
+    
+    sentiment_scores = []
+    price_changes = []
+    
+    for date in dates:
+        s_data = daily_sentiment[date]
+        bullish_ratio = s_data['bullish'] / s_data['total'] * 100 if s_data['total'] > 0 else 50
+        sentiment_scores.append(bullish_ratio)
+        
+        price_changes.append(prices[date]['pct'])
+    
+    # 计算相关系数（简化版）
+    import numpy as np
+    if len(sentiment_scores) > 1:
+        correlation = np.corrcoef(sentiment_scores, price_changes)[0, 1]
+    else:
+        correlation = 0
+    
+    return {
+        'ok': True,
+        'code': code,
+        'days': days,
+        'data_points': len(dates),
+        'correlation': round(float(correlation), 3),
+        'interpretation': _interpret_correlation(correlation),
+        'dates': [d.isoformat() for d in dates],
+        'sentiment_scores': [round(s, 2) for s in sentiment_scores],
+        'price_changes': [round(p, 2) for p in price_changes]
+    }
+
+
+def _interpret_correlation(corr: float) -> str:
+    """解释相关系数"""
+    if corr > 0.7:
+        return '强正相关：情绪高涨时股价往往上涨'
+    elif corr > 0.3:
+        return '中度正相关：情绪与股价有一定同步性'
+    elif corr > -0.3:
+        return '弱相关：情绪与股价关系不明显'
+    elif corr > -0.7:
+        return '中度负相关：情绪高涨时股价反而下跌（反向指标）'
+    else:
+        return '强负相关：典型反向指标，情绪越乐观越要警惕'
+
+
+def get_keyword_cloud(days: int = 7, top_n: int = 50) -> Dict[str, Any]:
+    """获取关键词云数据
+    
+    Args:
+        days: 统计天数
+        top_n: 返回前N个关键词
+    
+    Returns:
+        词云数据
+    """
+    since = dt.datetime.now() - dt.timedelta(days=days)
+    
+    with get_session() as s:
+        posts = s.execute(
+            select(SocialPost)
+            .where(SocialPost.created_at >= since)
+        ).scalars().all()
+        
+        if not posts:
+            return {'ok': False, 'msg': '暂无数据'}
+        
+        # 收集所有关键词
+        all_keywords = []
+        for p in posts:
+            if p.keywords:
+                all_keywords.extend(p.keywords.split(','))
+        
+        # 统计词频
+        keyword_counter = Counter(all_keywords)
+        
+        # 格式化为词云数据
+        wordcloud_data = [
+            {'name': kw, 'value': cnt}
+            for kw, cnt in keyword_counter.most_common(top_n)
+        ]
+    
+    return {
+        'ok': True,
+        'days': days,
+        'keywords': wordcloud_data
+    }
+