claude强化功能
This commit is contained in:
556
backend/sentiment_monitor.py
Normal file
556
backend/sentiment_monitor.py
Normal file
@@ -0,0 +1,556 @@
|
||||
"""社区情绪监控 — 爬取分析东方财富/雪球热帖,量化散户情绪。
|
||||
|
||||
功能:
|
||||
1. 爬取社区热帖
|
||||
2. 情绪分析(乐观/悲观)
|
||||
3. 热议股票排行
|
||||
4. 关键词提取和词云
|
||||
5. 情绪与股价相关性分析
|
||||
"""
|
||||
import datetime as dt
|
||||
import json
|
||||
import re
|
||||
from typing import List, Dict, Any, Optional
|
||||
from collections import Counter, defaultdict
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import jieba
|
||||
import jieba.analyse
|
||||
from sqlalchemy import select, func, and_, desc
|
||||
|
||||
from db import get_session
|
||||
from models import SocialPost, SentimentIndex, DailyQuote, StockMetric
|
||||
|
||||
# Sentiment lexicons. analyze_sentiment() counts how many of these substrings
# occur in a text to decide between bullish / bearish / neutral.
BULLISH_KEYWORDS = [
    '看多', '看好', '买入', '加仓', '抄底', '突破', '上涨', '暴涨', '牛市',
    '利好', '反弹', '强势', '拉升', '涨停', '走强', '看涨', '做多',
]

BEARISH_KEYWORDS = [
    '看空', '看跌', '卖出', '减仓', '止损', '下跌', '暴跌', '熊市',
    '利空', '回调', '弱势', '下杀', '跌停', '走弱', '做空', '被套',
]

# Stop words dropped from extracted keywords (kept as a set for O(1) lookup).
STOP_WORDS = {
    '的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一',
    '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
    '看', '好', '自己', '这', '那', '以', '为', '而', '能', '他', '对', '于',
}
|
||||
|
||||
|
||||
def crawl_eastmoney_hot(limit: int = 50) -> List[Dict[str, Any]]:
    """Return placeholder "hot posts" for the Eastmoney source.

    No network request is made: because of the site's anti-crawling
    measures this simplified version fabricates ``limit`` synthetic posts.
    A production crawler would additionally need to:
        1. use proxy IPs
        2. send browser-like headers
        3. throttle its request rate
        4. handle CAPTCHAs

    Args:
        limit: Number of mock posts to generate.

    Returns:
        A list of post dicts with the keys ``source``, ``post_id``,
        ``title``, ``content``, ``author``, ``comment_count`` and
        ``view_count``.
    """

    def _fake_post(idx: int) -> Dict[str, Any]:
        # Mock record; counts grow linearly with the index so posts differ.
        return {
            'source': 'eastmoney',
            'post_id': f'em_{idx}',
            'title': f'模拟帖子{idx}:今天大盘要反弹了',
            'content': '技术分析显示底部信号明显,建议逢低买入',
            'author': f'用户{idx}',
            'comment_count': 100 + idx * 10,
            'view_count': 1000 + idx * 100,
        }

    return list(map(_fake_post, range(limit)))
|
||||
|
||||
|
||||
def crawl_xueqiu_hot(limit: int = 50) -> List[Dict[str, Any]]:
    """Return placeholder "hot posts" for the Xueqiu source (simplified).

    The real Xueqiu API requires a cookie and a token obtained after login,
    so this version only fabricates ``limit`` synthetic posts.

    Args:
        limit: Number of mock posts to generate.

    Returns:
        A list of post dicts with the keys ``source``, ``post_id``,
        ``title``, ``content``, ``author``, ``comment_count`` and
        ``view_count``.
    """
    generated: List[Dict[str, Any]] = []
    for idx in range(limit):
        generated.append({
            'source': 'xueqiu',
            'post_id': f'xq_{idx}',
            'title': f'雪球热议{idx}:半导体板块分析',
            'content': '从产业链角度看,半导体景气度回升',
            'author': f'雪球用户{idx}',
            'comment_count': 50 + idx * 5,
            'view_count': 500 + idx * 50,
        })
    return generated
|
||||
|
||||
|
||||
def analyze_sentiment(text: str) -> str:
    """Classify a text as bullish, bearish or neutral by keyword hits.

    Each lexicon entry counts at most once, no matter how often it occurs
    in the text. Matching is plain substring search on the lower-cased text.

    Args:
        text: Text to analyse.

    Returns:
        One of the labels ``'bullish'``, ``'bearish'`` or ``'neutral'``.
    """
    haystack = text.lower()

    def _hits(lexicon) -> int:
        # Number of distinct lexicon entries present in the text.
        return len([word for word in lexicon if word in haystack])

    bulls = _hits(BULLISH_KEYWORDS)
    bears = _hits(BEARISH_KEYWORDS)

    # A side wins only if it strictly out-scores the other AND reaches at
    # least two distinct keywords; everything else is neutral.
    if bulls >= 2 and bulls > bears:
        return 'bullish'
    if bears >= 2 and bears > bulls:
        return 'bearish'
    return 'neutral'
|
||||
|
||||
|
||||
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """Extract keywords from a text with jieba.

    Stop words and single-character tokens are removed *after* jieba has
    selected its ``top_n`` tags, so fewer than ``top_n`` keywords may be
    returned.

    Args:
        text: Text content.
        top_n: Maximum number of keywords to return.

    Returns:
        List of keywords, in the order produced by jieba.
    """
    candidates = jieba.analyse.extract_tags(text, topK=top_n, withWeight=False)

    kept = []
    for word in candidates:
        # Drop stop words and one-character tokens.
        if word in STOP_WORDS or len(word) <= 1:
            continue
        kept.append(word)

    return kept[:top_n]
|
||||
|
||||
|
||||
def extract_stock_codes(text: str) -> List[str]:
    """Extract stock codes from a text.

    A code is a run of exactly six digits whose first digit is 0, 3 or 6.
    Digit lookarounds are used instead of ``\\b``: in Python 3 ``\\b`` is
    Unicode-aware and treats CJK characters as word characters, so a code
    written directly next to Chinese text (e.g. "买入600519吧") would never
    match with ``\\b``.

    Args:
        text: Text content. ``None`` or an empty string yields ``[]``.

    Returns:
        Unique stock codes in order of first appearance. The order is
        deterministic so that callers taking ``codes[0]`` get the first code
        mentioned in the text.
    """
    if not text:
        return []

    # Six digits starting with 0/3/6, not embedded in a longer digit run.
    pattern = r'(?<!\d)[036]\d{5}(?!\d)'
    matches = re.findall(pattern, text)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))
|
||||
|
||||
|
||||
def collect_posts(limit_per_source: int = 50) -> Dict[str, Any]:
    """Collect community posts, analyse them and store the new ones.

    Posts are pulled from every source; a failing source is reported on
    stdout and skipped so the others are still processed. Each post whose
    ``post_id`` is not yet stored is tagged with a sentiment label, keywords
    and the first stock code found in its text, then persisted.

    Args:
        limit_per_source: Number of posts to collect from each source.

    Returns:
        ``{'ok': True, 'collected': <posts fetched>, 'saved': <posts newly
        stored>}``.
    """
    all_posts = []

    sources = (
        ('eastmoney', crawl_eastmoney_hot),
        ('xueqiu', crawl_xueqiu_hot),
    )
    for label, crawler in sources:
        try:
            all_posts.extend(crawler(limit_per_source))
        except Exception as e:
            # Best effort: one broken source must not block the others.
            print(f"[{label}] crawl error: {e}")

    saved_count = 0
    with get_session() as s:
        # Look up already-stored ids with ONE query instead of one SELECT
        # per post.
        incoming_ids = [post['post_id'] for post in all_posts]
        known_ids = set()
        if incoming_ids:
            known_ids.update(
                s.execute(
                    select(SocialPost.post_id)
                    .where(SocialPost.post_id.in_(incoming_ids))
                ).scalars()
            )

        for post in all_posts:
            post_id = post['post_id']
            if post_id in known_ids:
                continue
            # Remember the id so a duplicate later in the same batch is
            # skipped as well, regardless of the session's autoflush setting.
            known_ids.add(post_id)

            # Treat a missing or None content as an empty string.
            content = post.get('content') or ''
            text = post['title'] + ' ' + content

            sentiment = analyze_sentiment(text)
            keywords = extract_keywords(text, top_n=5)

            # Only the first extracted code is stored; '' means "no code".
            codes = extract_stock_codes(text)
            code = codes[0] if codes else ''

            s.add(SocialPost(
                source=post['source'],
                post_id=post_id,
                code=code,
                title=post['title'],
                content=content,
                author=post.get('author', ''),
                comment_count=post.get('comment_count', 0),
                view_count=post.get('view_count', 0),
                sentiment=sentiment,
                keywords=','.join(keywords),
            ))
            saved_count += 1

        s.commit()

    return {
        'ok': True,
        'collected': len(all_posts),
        'saved': saved_count,
    }
|
||||
|
||||
|
||||
def calculate_sentiment_index(date: Optional[dt.date] = None) -> Dict[str, Any]:
    """Compute and persist the sentiment index for one day.

    Counts the posts created on the given day per sentiment label, derives
    the bullish share in percent and the 20 most frequent keywords, then
    inserts or updates the ``SentimentIndex`` row for that day.

    Args:
        date: Day to aggregate; ``None`` means today.

    Returns:
        ``{'ok': False, 'msg': ...}`` when the day has no posts, otherwise a
        dict with ``date``, the three counts, ``bullish_ratio`` (rounded to
        two decimals), ``total_posts`` and ``top_keywords``.
    """
    target_day = dt.date.today() if date is None else date

    # Inclusive bounds covering the whole calendar day.
    day_start = dt.datetime.combine(target_day, dt.time.min)
    day_end = dt.datetime.combine(target_day, dt.time.max)

    with get_session() as s:
        within_day = and_(
            SocialPost.created_at >= day_start,
            SocialPost.created_at <= day_end,
        )
        posts = s.execute(select(SocialPost).where(within_day)).scalars().all()

        if not posts:
            return {'ok': False, 'msg': '暂无数据'}

        # One pass over the posts; labels that never occur count as 0.
        tally = Counter(p.sentiment for p in posts)
        bullish_count = tally['bullish']
        bearish_count = tally['bearish']
        neutral_count = tally['neutral']
        total = len(posts)

        # total is at least 1 here because of the early return above.
        bullish_ratio = bullish_count / total * 100

        # Keywords are stored comma-joined on each post.
        keyword_counter = Counter(
            word
            for p in posts
            if p.keywords
            for word in p.keywords.split(',')
        )
        top_keywords = [
            {'word': word, 'count': freq}
            for word, freq in keyword_counter.most_common(20)
        ]
        top_keywords_json = json.dumps(top_keywords, ensure_ascii=False)

        # Upsert the row for this day.
        index_record = s.execute(
            select(SentimentIndex).where(SentimentIndex.date == target_day)
        ).scalar_one_or_none()

        if not index_record:
            index_record = SentimentIndex(
                date=target_day,
                bullish_count=bullish_count,
                bearish_count=bearish_count,
                neutral_count=neutral_count,
                bullish_ratio=bullish_ratio,
                total_posts=total,
                top_keywords=top_keywords_json,
            )
            s.add(index_record)
        else:
            index_record.bullish_count = bullish_count
            index_record.bearish_count = bearish_count
            index_record.neutral_count = neutral_count
            index_record.bullish_ratio = bullish_ratio
            index_record.total_posts = total
            index_record.top_keywords = top_keywords_json
            index_record.updated_at = dt.datetime.now()

        s.commit()

        return {
            'ok': True,
            'date': target_day.isoformat(),
            'bullish_count': bullish_count,
            'bearish_count': bearish_count,
            'neutral_count': neutral_count,
            'bullish_ratio': round(bullish_ratio, 2),
            'total_posts': total,
            'top_keywords': top_keywords,
        }
|
||||
|
||||
|
||||
def get_hot_stocks(days: int = 1, limit: int = 20) -> Dict[str, Any]:
    """Return the most-discussed stocks ranked by heat score.

    The heat score is ``post_count * 10 + total_comments``. The database
    query is ordered by that same expression, so the ``limit`` cut keeps the
    hottest stocks. Previously the query was cut by post count and only
    re-sorted by heat afterwards, which could drop a stock with few posts
    but many comments.

    Args:
        days: Look-back window in days, counted back from now.
        limit: Maximum number of stocks to return.

    Returns:
        ``{'ok': False, 'msg': ...}`` when no post with a stock code exists
        in the window, otherwise ``{'ok': True, 'days': days, 'stocks':
        [...]}`` where each entry has ``code``, ``name``, ``post_count``,
        ``total_comments``, ``total_views``, ``heat_score``, ``close`` and
        ``pct``.
    """
    since = dt.datetime.now() - dt.timedelta(days=days)

    post_count = func.count()
    total_comments = func.sum(SocialPost.comment_count)
    total_views = func.sum(SocialPost.view_count)
    # Same formula as the Python-side heat_score below; COALESCE because SUM
    # over only-NULL values yields NULL.
    heat = post_count * 10 + func.coalesce(total_comments, 0)

    with get_session() as s:
        # Aggregate per stock code; posts without a code are stored as ''.
        stmt = (
            select(
                SocialPost.code,
                post_count.label('post_count'),
                total_comments.label('total_comments'),
                total_views.label('total_views'),
            )
            .where(
                and_(
                    SocialPost.code != '',
                    SocialPost.created_at >= since,
                )
            )
            .group_by(SocialPost.code)
            # Post count and code are tie-breakers for a stable order.
            .order_by(desc(heat), desc('post_count'), SocialPost.code)
            .limit(limit)
        )

        rows = s.execute(stmt).all()

        if not rows:
            return {'ok': False, 'msg': '暂无数据'}

        # Look up name and latest price data for the selected codes.
        codes = [r.code for r in rows]
        metrics = {
            m.code: {'name': m.name, 'close': m.close, 'pct': m.pct}
            for m in s.execute(
                select(StockMetric).where(StockMetric.code.in_(codes))
            ).scalars()
        }

        results = []
        for r in rows:
            # Fall back to the bare code when no metric row exists.
            info = metrics.get(r.code, {'name': r.code, 'close': 0, 'pct': 0})
            comments = r.total_comments or 0
            results.append({
                'code': r.code,
                'name': info['name'],
                'post_count': r.post_count,
                'total_comments': comments,
                'total_views': r.total_views or 0,
                'heat_score': r.post_count * 10 + comments,
                'close': info['close'],
                'pct': info['pct'],
            })

        # Kept as a safeguard; rows already arrive ordered by heat.
        results.sort(key=lambda item: item['heat_score'], reverse=True)

        return {
            'ok': True,
            'days': days,
            'stocks': results,
        }
|
||||
|
||||
|
||||
def get_sentiment_history(days: int = 30) -> Dict[str, Any]:
    """Return the stored sentiment index as parallel time series.

    Args:
        days: Look-back window in days, counted back from today.

    Returns:
        ``{'ok': False, 'msg': ...}`` when no index rows exist in the
        window, otherwise ``{'ok': True, 'dates': [...], 'bullish_ratio':
        [...], 'total_posts': [...]}`` ordered by date ascending.
    """
    cutoff = dt.date.today() - dt.timedelta(days=days)

    with get_session() as s:
        query = (
            select(SentimentIndex)
            .where(SentimentIndex.date >= cutoff)
            .order_by(SentimentIndex.date)
        )
        records = s.execute(query).scalars().all()

        if not records:
            return {'ok': False, 'msg': '暂无历史数据'}

        # Build the three parallel series in one pass.
        day_labels = []
        ratios = []
        post_totals = []
        for rec in records:
            day_labels.append(rec.date.isoformat())
            ratios.append(round(rec.bullish_ratio, 2))
            post_totals.append(rec.total_posts)

        return {
            'ok': True,
            'dates': day_labels,
            'bullish_ratio': ratios,
            'total_posts': post_totals,
        }
|
||||
|
||||
|
||||
def analyze_sentiment_correlation(code: str, days: int = 60) -> Dict[str, Any]:
    """Correlate daily bullish share with the daily price move of one stock.

    For every day that has both community posts and a price quote, the
    bullish share of that day's posts (in percent) is paired with the
    open-to-close price change (in percent), and the Pearson correlation of
    the two series is computed. At least 10 such days are required.

    Args:
        code: Stock code.
        days: Look-back window in days, counted back from today.

    Returns:
        ``{'ok': False, 'msg': ...}`` when posts, quotes or overlapping days
        are insufficient, otherwise a dict with ``code``, ``days``,
        ``data_points``, ``correlation`` (rounded to three decimals),
        ``interpretation`` and the aligned ``dates``, ``sentiment_scores``
        and ``price_changes`` series. When one of the series is constant the
        correlation is undefined and reported as ``0.0``.
    """
    since = dt.date.today() - dt.timedelta(days=days)

    with get_session() as s:
        # Posts mentioning this stock inside the window.
        posts = s.execute(
            select(SocialPost).where(
                and_(
                    SocialPost.code == code,
                    func.date(SocialPost.created_at) >= since,
                )
            )
        ).scalars().all()

        if not posts:
            return {'ok': False, 'msg': '该股票暂无社区数据'}

        # Per-day label counts. A Counter tolerates labels outside
        # bullish/bearish/neutral (including None) instead of raising
        # KeyError, and reports 0 for labels that never occurred.
        daily_sentiment = defaultdict(Counter)
        for p in posts:
            day = p.created_at.date()
            daily_sentiment[day][p.sentiment] += 1
            daily_sentiment[day]['total'] += 1

        # Price data keyed by quote date.
        prices = {}
        quotes = s.execute(
            select(DailyQuote)
            .where(
                and_(
                    DailyQuote.code == code,
                    DailyQuote.date >= since,
                )
            )
            .order_by(DailyQuote.date)
        ).scalars()
        for q in quotes:
            open_price = float(q.open)
            close_price = float(q.close)
            prices[q.date] = {
                'close': close_price,
                # Open-to-close change in percent; 0 when open is not positive.
                'pct': (close_price - open_price) / open_price * 100 if open_price > 0 else 0,
            }

        if not prices:
            return {'ok': False, 'msg': '缺少股价数据'}

        # Only days present in both series can be compared.
        dates = sorted(set(daily_sentiment) & set(prices))

        if len(dates) < 10:
            return {'ok': False, 'msg': '数据点不足'}

        sentiment_scores = []
        price_changes = []
        for day in dates:
            counts = daily_sentiment[day]
            # 'total' is at least 1 for every day that has an entry.
            sentiment_scores.append(counts['bullish'] / counts['total'] * 100)
            price_changes.append(prices[day]['pct'])

        import numpy as np

        # Pearson's r is undefined (NaN) when either series has zero
        # variance. NaN would be misread as "strong negative" by
        # _interpret_correlation and is not valid JSON, so report 0.0.
        sentiment_is_flat = max(sentiment_scores) == min(sentiment_scores)
        price_is_flat = max(price_changes) == min(price_changes)
        if sentiment_is_flat or price_is_flat:
            correlation = 0.0
        else:
            correlation = float(np.corrcoef(sentiment_scores, price_changes)[0, 1])
            if not np.isfinite(correlation):
                correlation = 0.0

        return {
            'ok': True,
            'code': code,
            'days': days,
            'data_points': len(dates),
            'correlation': round(correlation, 3),
            'interpretation': _interpret_correlation(correlation),
            'dates': [d.isoformat() for d in dates],
            'sentiment_scores': [round(score, 2) for score in sentiment_scores],
            'price_changes': [round(change, 2) for change in price_changes],
        }
|
||||
|
||||
|
||||
def _interpret_correlation(corr: float) -> str:
|
||||
"""解释相关系数"""
|
||||
if corr > 0.7:
|
||||
return '强正相关:情绪高涨时股价往往上涨'
|
||||
elif corr > 0.3:
|
||||
return '中度正相关:情绪与股价有一定同步性'
|
||||
elif corr > -0.3:
|
||||
return '弱相关:情绪与股价关系不明显'
|
||||
elif corr > -0.7:
|
||||
return '中度负相关:情绪高涨时股价反而下跌(反向指标)'
|
||||
else:
|
||||
return '强负相关:典型反向指标,情绪越乐观越要警惕'
|
||||
|
||||
|
||||
def get_keyword_cloud(days: int = 7, top_n: int = 50) -> Dict[str, Any]:
    """Return keyword frequencies formatted for a word cloud.

    Args:
        days: Look-back window in days, counted back from now.
        top_n: Number of most frequent keywords to return.

    Returns:
        ``{'ok': False, 'msg': ...}`` when no posts exist in the window,
        otherwise ``{'ok': True, 'days': days, 'keywords': [{'name': ...,
        'value': ...}, ...]}`` ordered by descending frequency.
    """
    cutoff = dt.datetime.now() - dt.timedelta(days=days)

    with get_session() as s:
        recent_query = select(SocialPost).where(SocialPost.created_at >= cutoff)
        posts = s.execute(recent_query).scalars().all()

        if not posts:
            return {'ok': False, 'msg': '暂无数据'}

        # Keywords are stored comma-joined on each post; count every
        # occurrence across all posts.
        frequencies = Counter()
        for post in posts:
            if not post.keywords:
                continue
            frequencies.update(post.keywords.split(','))

        wordcloud_data = []
        for word, freq in frequencies.most_common(top_n):
            wordcloud_data.append({'name': word, 'value': freq})

        return {
            'ok': True,
            'days': days,
            'keywords': wordcloud_data,
        }
|
||||
|
||||
Reference in New Issue
Block a user