tako-copilotkit/agents/python/src/lib/chat.py at main · TakoData/tako-copilotkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
"""Chat Node"""

import logging
from typing import List, Literal, cast

from copilotkit.langgraph import copilotkit_customize_config, copilotkit_emit_state
from langchain.tools import tool
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import Command

from src.lib.download import get_resource
from src.lib.model import get_model
from src.lib.state import AgentState, DataQuestion
from src.lib.mcp_integration import (
    get_visualization_iframe
)

logger = logging.getLogger(__name__)


# Feature toggles
ENABLE_DEEP_QUERIES = False


@tool
def Search(queries: List[str]):  # pylint: disable=invalid-name,unused-argument
    """A list of one or more search queries to find good resources to support the research."""


@tool
def WriteReport(report: str):  # pylint: disable=invalid-name,unused-argument
    """Write the research report."""


@tool
def WriteResearchQuestion(research_question: str):  # pylint: disable=invalid-name,unused-argument
    """Write the research question."""


@tool
def DeleteResources(urls: List[str]):  # pylint: disable=invalid-name,unused-argument
    """Delete the URLs from the resources."""


@tool
def GenerateDataQuestions(questions: List[DataQuestion]):  # pylint: disable=invalid-name,unused-argument
    """
    Generate 3-6 data-focused questions to search Tako's knowledge base.

    Create a diverse set of questions:
    - 2-4 basic questions (search_effort='fast') for straightforward data lookups AND superlative/ranking queries
    - 0-1 prediction market questions (search_effort='deep') about forecasts, probabilities, or future outcomes

    CRITICAL - Generate ATOMIC, non-compound queries:
    - Each query should ask for ONE metric/dimension only
    - NEVER combine multiple metrics in a single query (e.g., "compare X and Y", "X vs Y", "X and Y trends")
    - Instead, split compound questions into separate atomic queries

    BAD (compound):
    - "Compare San Francisco fentanyl consumption and population"
    - "China GDP vs inflation"
    - "Unemployment and wage growth in the US"

    GOOD (atomic):
    - "San Francisco fentanyl consumption"
    - "San Francisco population"
    - "China GDP"
    - "China inflation"
    - "US unemployment rate"
    - "US wage growth"

    Superlative/ranking queries are fine (these are atomic):
    - "Which countries have the highest GDP per capita?"
    - "Which cities have the highest rent?"
    - "Top 10 companies by market cap"

    Example:
    [
        {"question": "China GDP since 1960", "search_effort": "fast", "query_type": "basic"},
        {"question": "China inflation rate", "search_effort": "fast", "query_type": "basic"},
        {"question": "Which countries have the highest inflation rates in 2024?", "search_effort": "fast", "query_type": "basic"},
        {"question": "What are prediction market odds for China invading Taiwan in 2025?", "search_effort": "deep", "query_type": "prediction_market"}
    ]
    """


async def chat_node(
    state: AgentState, config: RunnableConfig
) -> Command[Literal["search_node", "chat_node", "delete_node", "__end__"]]:
    """
    Chat Node
    """
    logger.info("=== CHAT_NODE: Starting execution ===")

    # Note: report is NOT in emit_intermediate_state to prevent flicker
    # The report is only emitted once charts are injected
    config = copilotkit_customize_config(
        config,
        emit_intermediate_state=[
            {
                "state_key": "research_question",
                "tool": "WriteResearchQuestion",
                "tool_argument": "research_question",
            },
            {
                "state_key": "data_questions",
                "tool": "GenerateDataQuestions",
                "tool_argument": "questions",
            },
        ],
    )

    state["resources"] = state.get("resources", [])
    research_question = state.get("research_question", "")
    report = state.get("report", "")

    resources = []
    tako_charts_map = {}
    available_tako_charts = []

    for resource in state["resources"]:
        # Tako charts - use stored description as content
        if resource.get("resource_type") == "tako_chart":
            title = resource.get("title", "")
            card_id = resource.get("card_id")  # Changed from pub_id
            embed_url = resource.get("embed_url")
            description = resource.get("description", "")

            # Add to resources with description as content
            resources.append({
                **resource,
                "content": description
            })

            # Build Tako charts map for post-processing (generate iframe on demand)
            if title and (card_id or embed_url):
                # Store card_id/embed_url for later iframe generation
                tako_charts_map[title] = {"card_id": card_id, "embed_url": embed_url}
                available_tako_charts.append(f"  - **{title}**\n    Description: {description}")
        else:
            # Web resources: use pre-stored Tavily summary (no download needed)
            content = resource.get("content", "")
            if not content:
                # Fallback: download if content is missing (shouldn't happen normally)
                content = get_resource(resource["url"])
                if content == "ERROR":
                    continue
            resources.append({**resource, "content": content})

    available_tako_charts_str = "\n".join(available_tako_charts) if available_tako_charts else "  (No Tako charts available yet)"

    logger.info(f"Built tako_charts_map with {len(tako_charts_map)} charts")
    logger.info(f"Chart titles: {list(tako_charts_map.keys())}")

    model = get_model(state)
    # Prepare the kwargs for the ainvoke method
    ainvoke_kwargs = {}
    if model.__class__.__name__ in ["ChatOpenAI"]:
        ainvoke_kwargs["parallel_tool_calls"] = False

    # Build dynamic prompt based on feature toggles
    if ENABLE_DEEP_QUERIES:
        data_questions_instructions = """2. THEN: Use GenerateDataQuestions to create 3-6 data-focused questions with varied complexity:
               - Generate ATOMIC queries - each query asks for ONE metric/dimension only
               - 2-3 BASIC questions (fast search) for straightforward data: "Country X GDP 2020-2024"
               - 1-2 COMPLEX questions (deep search) for analytical insights
               - 0-1 PREDICTION MARKET question (deep search) if relevant: "What are prediction market odds for X in 2025?"
               - Use the entities, metrics, cohorts, and time periods listed in the knowledge base context above when available
               - Prefer exact entity/metric names from the knowledge base context for better search results"""
    else:
        data_questions_instructions = """2. THEN: Use GenerateDataQuestions to create 3-6 data-focused questions:
               - Generate ATOMIC queries - each query asks for ONE metric/dimension only
               - Instead, split compound questions into separate atomic queries
               - Examples of GOOD atomic queries:
                 * "US GDP 2020-2024" (single metric)
                 * "US inflation rate" (single metric, separate query)
                 * "Which countries have the highest rent?" (superlative - this is atomic)
                 * "Top 10 companies by revenue" (ranking - this is atomic)
               - Examples of BAD compound queries to AVOID:
                 * "Compare US GDP and inflation" -> split into two queries
                 * "San Francisco population vs rent" -> split into two queries
               - 0-1 PREDICTION MARKET question (deep search) if relevant: "What are prediction market odds for X in 2025?"
               - Use the entities, metrics, cohorts, and time periods listed in the knowledge base context above when available
               - Prefer exact entity/metric names from the knowledge base context for better search results"""

    # Add status update for query analysis
    state["logs"] = state.get("logs", [])
    state["logs"].append({"message": "Analyzing your research query...", "done": False})
    await copilotkit_emit_state(config, state)

    response = await model.bind_tools(
        [
            Search,
            WriteReport,
            WriteResearchQuestion,
            GenerateDataQuestions,
        ],
        **ainvoke_kwargs,  # Pass the kwargs conditionally
    ).ainvoke(
        [
            SystemMessage(
                content=f"""
            You are a research assistant. You help the user with writing a research report.
            Do not recite the resources, instead use them to answer the user's question.

            {state.get("explore_context", "")}

            RESEARCH WORKFLOW:
            1. FIRST: When you receive a user's query, use WriteResearchQuestion to extract/formulate the core research question
            {data_questions_instructions}
            3. These questions will search Tako for relevant charts and visualizations
            4. Use the Search tool for web resources
            5. Write a clear, well-structured report using the data from your searches
            6. Combine insights from both Tako charts and web resources in your report

            IMPORTANT ABOUT RESEARCH QUESTION:
            - Always start by using WriteResearchQuestion to capture the user's research intent
            - This creates a clear, focused question from their natural language query
            - If a research question is already provided, YOU MUST NOT ASK FOR IT AGAIN

            AVAILABLE DATA VISUALIZATIONS ({len(tako_charts_map)} charts):
{available_tako_charts_str}

            WRITING GUIDELINES:
            - Write a COMPREHENSIVE report with substantial analysis and narrative text
            - Use the chart descriptions above AND web resources to write detailed, insightful paragraphs
            - For EACH chart, write at least 1-2 paragraphs discussing its key insights, trends, and implications
            - Structure the report so that text naturally leads into and follows from each data point
            - DO NOT include any chart markers, image syntax, or embed codes - charts will be inserted automatically
            - DO NOT use markdown image syntax like ![title](url)
            - DO NOT include external links like tradingeconomics.com
            - Focus on analysis and insights - explain WHAT the data shows and WHY it matters
            - Reference specific data points, numbers, and trends from the chart descriptions
            - Connect insights across multiple charts to tell a cohesive story

            You should use the search tool to get resources before answering the user's question.
            Use the content and descriptions from both Tako charts and web resources to inform your report.
            To write the report, you should use the WriteReport tool. Never EVER respond with the report content, only use the tool.
            After writing the report, send a brief (1-2 sentence) follow-up asking if the user wants any changes or has questions. Do NOT summarize or repeat the report content in the chat.

            This is the research question:
            {research_question}

            This is the research report:
            {report}

            Here are the resources that you have available:
            {resources}
            """
            ),
            *state["messages"],
        ],
        config,
    )

    # Mark query analysis as complete
    state["logs"][-1]["done"] = True
    await copilotkit_emit_state(config, state)

    ai_message = cast(AIMessage, response)
    if ai_message.tool_calls:
        if ai_message.tool_calls[0]["name"] == "WriteReport":
            # Add progress indicator for report generation
            state["logs"].append({"message": "Writing research report...", "done": False})
            await copilotkit_emit_state(config, state)

            report = ai_message.tool_calls[0]["args"].get("report", "")

            # Mark report writing as done
            state["logs"][-1]["done"] = True
            await copilotkit_emit_state(config, state)

            # Clean up: Remove any markdown image links that the LLM incorrectly added
            import re
            external_domains = r'(tradingeconomics|worldbank|imf|fred|ourworldindata|statista)'
            report = re.sub(rf'!\[([^\]]+)\]\(https?://[^)]*{external_domains}[^)]*\)',
                          r'', report, flags=re.IGNORECASE)

            # Remove any markdown images
            report = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', report)

            # Remove any leftover chart markers (in case model still added them)
            report = re.sub(r'\[TAKO_CHART:[^\]]+\]', '', report)

            # Second pass: Inject charts at appropriate positions
            processed_report = report
            if tako_charts_map:
                state["logs"].append({"message": "Inserting data visualizations...", "done": False})
                await copilotkit_emit_state(config, state)
                # Build chart list for injection prompt
                chart_list = "\n".join([f"- {title}" for title in tako_charts_map.keys()])

                # Ask model to insert chart markers at appropriate positions
                inject_response = await model.ainvoke(
                    [
                        SystemMessage(content=f"""You are a report editor. Your task is to insert chart markers into the report at appropriate positions.

AVAILABLE CHARTS:
{chart_list}

RULES:
1. Insert [CHART:exact_title] markers where each chart would best support the text
2. Place markers AFTER the relevant paragraph (not in the middle of text)
3. Each chart should be used exactly once
4. Only use charts from the AVAILABLE CHARTS list above
5. Return the COMPLETE report with markers inserted
6. Do not modify the text content, only add markers
7. Add a blank line before and after each marker

CRITICAL PLACEMENT RULES:
8. NEVER place more than two charts consecutively - there MUST be at least one paragraph of text between any two charts
9. NEVER append multiple charts at the end of the report - distribute them throughout the text
10. Each chart should be placed IMMEDIATELY after the paragraph that discusses its specific data/topic
11. If the report doesn't have enough text to properly intersperse all charts, place charts where they're most relevant and leave remaining charts unplaced rather than clustering them

Example of GOOD placement:
The economy grew significantly in 2023...

[CHART:GDP Growth 2023]

This growth was driven by consumer spending. Meanwhile, unemployment continued its downward trend...

[CHART:Unemployment Rate 2023]

The labor market strength contributed to...

Example of BAD placement (DO NOT DO THIS):
The economy grew significantly in 2023...
This growth was driven by consumer spending...
The labor market showed improvement...

[CHART:GDP Growth 2023]

[CHART:Unemployment Rate 2023]

[CHART:Inflation Data 2023]
"""),
                        HumanMessage(content=f"Insert chart markers into this report:\n\n{report}")
                    ],
                    config
                )

                report_with_markers = inject_response.content if hasattr(inject_response, 'content') else str(inject_response)

                # Replace chart markers with actual iframe HTML
                async def replace_marker(match):
                    chart_title = match.group(1).strip()
                    chart_info = tako_charts_map.get(chart_title)

                    # Try case-insensitive match if exact match fails
                    if not chart_info:
                        for title, info in tako_charts_map.items():
                            if title.lower() == chart_title.lower():
                                chart_info = info
                                break

                    if not chart_info:
                        logger.warning(f"Chart not found: {chart_title}")
                        return ""

                    iframe_html = await get_visualization_iframe(
                        item_id=chart_info.get("card_id"),
                        embed_url=chart_info.get("embed_url")
                    )

                    if iframe_html:
                        iframe_only = re.sub(r'<script.*?</script>', '', iframe_html, flags=re.DOTALL)
                        return "\n" + iframe_only.strip() + "\n"
                    return ""

                # Find and replace all markers
                markers = list(re.finditer(r'\[CHART:([^\]]+)\]', report_with_markers))
                replacements = []
                for match in markers:
                    replacement = await replace_marker(match)
                    replacements.append((match.start(), match.end(), replacement))

                # Apply replacements in reverse order
                processed_report = report_with_markers
                for start, end, replacement in reversed(replacements):
                    processed_report = processed_report[:start] + replacement + processed_report[end:]

                logger.info(f"Injected {len([r for r in replacements if r[2]])} charts into report")

                # Mark chart injection as done
                state["logs"][-1]["done"] = True
                await copilotkit_emit_state(config, state)

            # Clear logs before showing final report
            state["logs"] = []
            await copilotkit_emit_state(config, state)

            return Command(
                goto="chat_node",
                update={
                    "report": processed_report,
                    "resources": state.get("resources", []),  # Preserve resources
                    "messages": [
                        ai_message,
                        ToolMessage(
                            tool_call_id=ai_message.tool_calls[0]["id"],
                            content="Report written successfully. Now send a brief follow-up message asking if the user wants any changes or has questions. Do NOT repeat the report content.",
                        ),
                    ],
                },
            )
        if ai_message.tool_calls[0]["name"] == "WriteResearchQuestion":
            research_question = ai_message.tool_calls[0]["args"]["research_question"]
            return Command(
                goto="chat_node",
                update={
                    "research_question": research_question,
                    "resources": state.get("resources", []),  # Preserve resources
                    "messages": [
                        ai_message,
                        ToolMessage(
                            tool_call_id=ai_message.tool_calls[0]["id"],
                            content="Research question written.",
                        ),
                    ],
                },
            )

    goto = "__end__"
    if ai_message.tool_calls:
        tool_name = ai_message.tool_calls[0]["name"]
        if tool_name == "Search":
            goto = "search_node"
        elif tool_name == "DeleteResources":
            goto = "delete_node"
        elif tool_name == "GenerateDataQuestions":
            # Store data questions and route to search
            data_questions = ai_message.tool_calls[0]["args"].get("questions", [])

            # Add status update for generated questions
            if data_questions:
                state["logs"].append({
                    "message": f"Generated {len(data_questions)} search questions",
                    "done": True
                })
                await copilotkit_emit_state(config, state)

            logger.info(f"GenerateDataQuestions: Routing to search_node with {len(data_questions)} questions")
            return Command(
                goto="search_node",
                update={
                    "data_questions": data_questions,
                    "resources": state.get("resources", []),  # Preserve resources
                    "messages": [
                        ai_message,
                        ToolMessage(
                            tool_call_id=ai_message.tool_calls[0]["id"],
                            content=f"Generated {len(data_questions)} data questions for Tako search.",
                        ),
                    ],
                },
            )

    logger.info(f"=== CHAT_NODE: Routing to {goto} ===")
    return Command(goto=goto, update={"messages": response, "resources": state.get("resources", [])})