COGS 108 - Trump's Social Media Posts and Market Volatility¶
Permissions¶
Place an X in the appropriate bracket below to specify if you would like your group's project to be made available to the public, including the names of the authors.
- [ X ] YES - make available
- [ ] NO - keep private
Link to video¶
https://drive.google.com/file/d/1XR1wVeRRXsolYOkwBCAnoPaQPwYYTo3g/view?usp=drive_link
Abstract¶
In the modern world, prominent figures, including presidents, can deliver their messages to millions of people instantly through social media platforms. Such posts can spark reactions across political, social, and economic spaces. Since financial markets are highly sensitive to information covered by mainstream media, it is reasonable to wonder whether social media posts by presidents have the same strength to create meaningful fluctuations in financial markets. This study explores this question by analyzing Donald Trump’s economic and financial posts across his two presidential terms along with trends seen in market volatility in U.S. stock indices and cryptocurrency markets.
We obtained over 60 thousand posts from X (Twitter) and Truth Social, published between January 2016 and November 2025, which we then filtered to identify posts containing economic keywords such as “tariff”, “rate”, “trade”, “jobs”, “inflation”, and “Fed”. Our financial market data include ETFs tracking the S&P 500, Nasdaq Composite, and Dow Jones Industrial Average, along with Bitcoin prices and total cryptocurrency market capitalization. Donald Trump’s posting activity was then paired with the financial market data to perform event-window analyses around the time each keyword post was published. As a control, randomly selected periods were used, allowing for a baseline comparison of asset prices before and after each post.
Using Parkinson’s range-based estimator, we converted equity volatility into abnormal volatility z-scores. The plots showed small day-to-day movement around Trump’s economic posts, but these shifts were not statistically significant. Bitcoin shows even less of a pattern: both the event-window results and the slope comparisons look mostly like regular crypto noise, making it hard to attribute any changes in value directly to the posts. The rest of the cryptocurrency market behaves the same way, with most keyword categories showing no noticeable or statistically significant changes. Overall, while there are a few isolated spikes here and there, Trump’s economic posts do not show a strong or reliable connection to market volatility.
Authors¶
- Kento Bushey: Conceptualization, Background Research, Crypto Data wrangling, Analysis, Visualization
- Ryan Luutuyen: Data curation, S&P500 Data Wrangling, Analysis, Visualization
- Kevin Mata: Data curation, Experimental investigation, Writing - original draft
- Mahdi Najjar: Conceptualization, Writing - original draft, Writing - review & editing
- Alessio Yu: Data curation, Visualization, Analysis, Social Media Web Scraping and Data Wrangling
Research Question¶
Does the occurrence of Trump’s social media posts containing economic or financial keywords (e.g., “tariff,” “rate,” “trade,” “jobs,” “inflation,” “Fed”) correlate with increased volatility in the U.S. stock and global cryptocurrency markets during the days following the posts, and does this relationship differ between his first and second terms?
Background and Prior Work¶
The president of the United States is one of the most powerful roles in the world. Donald Trump was first elected in 2016 and served for four years, and has been elected again for his second term starting in 2025. Due to the president's great power, their public communications are under scrutiny both nationally and internationally.
One method of public communication the president has is social media. Twitter has become a popular platform for presidents to communicate with the public. Donald Trump also has his own platform called Truth Social. Social media allows for immediate dissemination of statements, potentially affecting public opinion and financial markets simultaneously. Researchers have investigated whether tweets by prominent figures, such as Trump, can create measurable effects on market behavior.1
The global market cap is estimated to be 135 trillion dollars, with the United States alone holding 70 trillion dollars.3 Approximately 40% of the U.S. stock market is held in Americans' 401(k) accounts, so even small market fluctuations can result in substantial gains or losses for individual investors.2 Understanding the drivers of market volatility, including social media communications, is therefore of both economic and societal interest.
Previous research by Wolff analyzed the potential effects of Donald Trump's tweets on stock prices during his first term. Wolff found some short-term anomalies in social media activity and financial markets but concluded that the overall market response was not statistically significant across all sectors.1
In addition to traditional markets, the cryptocurrency market, with a total market capitalization of approximately 3.74 trillion dollars,4 may be particularly sensitive to news and communications such as presidential tweets. Unlike equities held largely in retirement accounts, crypto investments are typically more liquid and speculative, which can lead to rapid price reactions. Recent research shows that Bitcoin and Ethereum volatilities respond significantly to macroeconomic data releases, particularly U.S. monetary policy announcements, with pronounced effects in the pre-announcement period and heightened sensitivity during the pandemic.5 This suggests that the inherent volatility of crypto assets makes them highly responsive to news and social signals, supporting the rationale for analyzing the effects of political communications on these markets.
- ^ Wolff, L. (October 2019) Financial Anomalies in Social Media – Analyzing Potential Effects of Donald Trump’s Tweets on the Stock Market. Lund University, Department of Economics (NEKH02). https://lup.lub.lu.se/luur/download?func=downloadFile&recordOId=9012527&fileOId=9012533
- ^ Rosenthal, S. M. & Austin, L. S. (May 16 2016) The Dwindling Taxable Share of U.S. Corporate Stock. Tax Notes, Vol. 151, No. 6, pp. 923–934. https://www.taxpolicycenter.org/sites/default/files/alfresco/publication-pdfs/2000790-The-Dwindling-Taxable-Share-of-U.S.-Corporate-Stock.pdf
- ^ “Companies ranked by Market Cap – CompaniesMarketCap.com.” https://companiesmarketcap.com/ (companiesmarketcap.com)
- ^ “Cryptocurrency Market Capitalization – CoinMarketCap Charts.” https://coinmarketcap.com/charts/
- ^ Chundakkadan, R. et al. (2025) Cryptocurrency price volatility responses to macroeconomic news: Evidence from Bitcoin and Ethereum. Finance Research Letters, Vol. 54, 103757. https://www.sciencedirect.com/science/article/pii/S1059056025006720
Hypothesis¶
We hypothesize that Trump’s social media posts referencing economic or financial topics, such as tariffs, trade, or interest rates, are associated with changes in market volatility in the U.S. stock market and cryptocurrency market. Specifically, we expect that U.S. stock volatility can be measured using the daily high and low difference of the market-cap weighted S&P 500 index fund, with daily closing prices capturing end-of-day market reactions. Cryptocurrency volatility is expected to be higher and more immediate, with Bitcoin hourly price data providing granular insight into short-term responses. Additionally, daily total cryptocurrency market capitalization will be used to assess broader market movements.
We predict that volatility will be most pronounced on the day immediately following a relevant post, particularly for cryptocurrency assets, and that the effect will gradually taper over subsequent days. Posts on topics directly related to trade or monetary policy are expected to produce larger market reactions. We also hypothesize that during Trump’s second term, market sensitivity to these posts will be stronger, reflecting increased attention or media amplification compared with the first term. This framework allows us to compare the magnitude and timing of market responses across asset classes and presidential terms.
Data¶
Data overview¶
For each dataset, we include the following information:
- Dataset #1 Crypto Market Cap Daily data
- Link to the dataset: https://www.coingecko.com/en/charts
- Number of observations: 18,514
- Number of variables: 4
- Description of the variables most relevant to this project: market cap in USD (the total value of cryptocurrency in circulation) and trading volume (the total value of coins traded in a 24-hour period)
- Descriptions of any shortcomings this dataset has with respect to the project: Volume data is missing for all records prior to December 26, 2013
- Dataset #2 Bitcoin Volume and Trading Data
- Link to the dataset: www.cryptodatadownload.com/data/gemini/
- Number of observations: Hourly and daily records from October 8, 2015, through November 11, 2025
- Number of variables: 4
- Description of the variables most relevant to this project: open, high, low, close (hourly prices in USD per Bitcoin) and Volume BTC, Volume USD (trading volumes exchanged during that hour)
- Descriptions of any shortcomings this dataset has with respect to the project: Some records show zero volume, which indicates missing or unreported data
- Dataset #3 Trump Social Media (X and Truth) Posts
- Link to the dataset: Roll Call Factbase Twitter
- Number of observations: 61995
- Number of variables: 5 (platform, timestamp_et, timestamp_epoch, link, description)
- Description of the variables most relevant to this project
- description: Trump's X/Truth post content, this can allow us to find keywords
- timestamp_et/timestamp_epoch: The timestamp in which Trump posted
- platform: whether the post was published on Truth Social or X (formerly Twitter)
- Descriptions of any shortcomings this dataset has with respect to the project
- The data accuracy is dependent on Roll Call's record keeping. Since we scraped the data from Roll Call's public facing site, if any record is not accurate or complete, then our analysis may be affected.
- Dataset #4 ETFs Daily Trading Values
- Dataset Name: SPX, COMP, DJIA Daily Trading Values
- Link to the dataset: Historical Quotes SPX, Historical Quotes DJIA, Historical Quotes COMP
- Number of observations: 2478
- Number of variables: 5
- Description of the variables most relevant to this project: High: highest market value of a share in the day; Low: lowest market value of a share in the day; Close: closing value of a share at the end of the day
- Descriptions of any shortcomings this dataset has with respect to the project: Purely numerical data does not explain the context behind sudden drastic changes in the market value of shares
# Run this code every time when you're actively developing modules in .py files. It's not needed if you aren't making modules
#
## this code is necessary for making sure that any modules we load are updated here
## when their source code .py files are modified
%load_ext autoreload
%autoreload 2
# Setup code -- Run only once after cloning!!!
#
# this code downloads the data from its source to the `data/00-raw/` directory
# if the data hasn't updated you don't need to do this again!
# if you don't already have these packages (you should!) uncomment this line
# %pip install requests tqdm
import sys
sys.path.append('./modules') # this tells python where to look for modules to import
import get_data # this is where we get the function we need to download data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import date, timedelta, datetime
import seaborn as sns
from modules.data_cleanup import *
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from modules.hourly_data_functions import *
from modules.volatility_analysis import run_btc_volatility_analysis
from modules.crypto_volatility_module import *
from modules.ETF_volatility_functions import *
# replace the urls and filenames in this list with your actual datafiles
# yes you can use Google drive share links or whatever
# format is a list of dictionaries;
# each dict has keys of
# 'url' where the resource is located
# 'filename' for the local filename where it will be stored
# datafiles = [
# { 'url': 'https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/airline-safety/airline-safety.csv', 'filename':'airline-safety.csv'},
# { 'url': 'https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/bad-drivers/bad-drivers.csv', 'filename':'bad-drivers.csv'}
# ]
# get_data.get_raw(datafiles,destination_directory='data/00-raw/')
import warnings
warnings.filterwarnings('ignore')
Dataset #1 | Global Cryptocurrency Market Capitalization and Volume (Daily, 2013–2025)¶
This dataset contains daily global cryptocurrency market data from April 29, 2013, through November 11, 2025, compiled by CoinGecko. It includes 4,580 daily observations with three columns: a Unix timestamp (snapped_at), total market capitalization, and total 24-hour trading volume, all expressed in U.S. dollars (USD). Each row represents one full trading day across the global cryptocurrency market. Because cryptocurrencies trade continuously, these daily values reflect full-day aggregates rather than exchange-specific trading hours.
Market capitalization measures the total value of all circulating cryptocurrencies, calculated as price × supply. Typical values range from about 1 billion dollars in 2013 to more than 2 trillion dollars in 2025, indicating massive market expansion. Total trading volume measures the dollar value of all trades executed within a 24-hour period, serving as a liquidity indicator. Both metrics are in USD, with volume and market cap typically correlated during high-volatility periods.
Missing data occur for total volume before December 26, 2013, affecting 241 early records (5.3% of total rows). Because this period lies outside our analytical window, these records are excluded from analyses. Outliers are defined as single-day market cap or volume changes exceeding three standard deviations from a 7-day rolling mean. These reflect genuine market shocks (e.g., 2017 boom, 2021 crash) rather than data errors.
A potential limitation is that CoinGecko aggregates data across exchanges and coins, with early-period coverage skewed toward large-cap cryptocurrencies such as Bitcoin. Smaller or regional markets may be underrepresented. Despite this, the dataset provides a comprehensive, globally aggregated record of cryptocurrency market activity suitable for time-series analysis of volatility and long-term trends.
df1 = pd.read_csv("data/00-raw/crypto_marketcap_daily.csv")
df1["datetime"] = pd.to_datetime(df1["snapped_at"], unit="ms")
print(df1.head())
      snapped_at    market_cap  total_volume   datetime
0  1367193600000  1.661442e+09           0.0 2013-04-29
1  1367280000000  1.592765e+09           0.0 2013-04-30
2  1367366400000  1.378705e+09           0.0 2013-05-01
3  1367452800000  1.220763e+09           0.0 2013-05-02
4  1367539200000  1.075224e+09           0.0 2013-05-03
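The description above flags outliers as single-day changes exceeding three standard deviations from a 7-day rolling mean. A minimal sketch of that rule, assuming it is applied to day-over-day changes in market cap (the mc_outlier column is hypothetical and not part of our modules):

# Hypothetical illustration of the 3-SD / 7-day rolling-mean outlier rule
# described above; the project's processed data may flag outliers differently.
daily_change = df1["market_cap"].diff()
rolling_mean = daily_change.rolling(window=7).mean()
rolling_std = daily_change.rolling(window=7).std()
df1["mc_outlier"] = (daily_change - rolling_mean).abs() > 3 * rolling_std
print(df1["mc_outlier"].sum(), "days flagged as potential market-cap outliers")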
# Term 1: 1/1/2017 - 12/31/2020
term1 = df1[(df1["datetime"] >= "2017-01-01") & (df1["datetime"] <= "2020-12-31")]
# Term 2: 1/20/2025 - current (second term)
term2 = df1[df1["datetime"] >= "2025-01-20"]
term1.head(5)
| snapped_at | market_cap | total_volume | datetime | |
|---|---|---|---|---|
| 1336 | 1483228800000 | 1.841179e+10 | 3.924458e+09 | 2017-01-01 |
| 1337 | 1483315200000 | 1.883194e+10 | 5.077314e+09 | 2017-01-02 |
| 1338 | 1483401600000 | 1.923852e+10 | 4.989160e+09 | 2017-01-03 |
| 1339 | 1483488000000 | 2.104879e+10 | 9.438407e+09 | 2017-01-04 |
| 1340 | 1483574400000 | 1.857421e+10 | 1.286286e+10 | 2017-01-05 |
# Plot Term 1: Market Cap
plt.figure(figsize=(12, 5))
plt.plot(term1["datetime"], term1["market_cap"], color="blue")
plt.title("Crypto Market Cap: Term 1 (2017-2020)")
plt.xlabel("Date")
plt.ylabel("Market Cap (USD)")
plt.grid(True)
plt.show()
# Plot Term 2: Market Cap
plt.figure(figsize=(12, 5))
plt.plot(term2["datetime"], term2["market_cap"], color="green")
plt.title("Crypto Market Cap: Term 2 (2025-current)")
plt.xlabel("Date")
plt.ylabel("Market Cap (USD)")
plt.grid(True)
plt.show()
Dataset #2 - Bitcoin Price and Volume (Hourly, 2015–2025)¶
This dataset contains hourly Bitcoin (BTC) trading data denominated in U.S. dollars (USD) from October 8, 2015, at 13:00 UTC through November 11, 2025. Each record represents one hour of trading activity and includes the opening, high, low, and closing prices of BTC in USD, along with trading volumes measured both in BTC and USD. Prices are expressed as USD per Bitcoin, while volumes represent the total amount of BTC or equivalent USD traded within each hourly interval.
Because cryptocurrency exchanges operate continuously, timestamps are in Coordinated Universal Time (UTC), ensuring uniform comparison across all trading hours without regard to regional market closures. This time resolution allows for finer-grained analysis of intraday volatility, liquidity changes, and short-term market reactions to events.
Data completeness is high overall. For Term 1 (January 1, 2017 – December 31, 2020), 756 out of 43,822 records (1.73%) show missing BTC or USD volume data, primarily early in the dataset. For Term 2 (January 1, 2025 – present), only 3 out of 7,536 records (0.04%) have missing volume entries. Missingness appears random and not associated with specific time periods or market conditions.
Outliers, particularly extreme price or volume spikes, are expected to correspond to real market events such as sudden demand surges or flash crashes. The data are tidy: each column represents a distinct variable, each row corresponds to a one-hour observation, and all entries are atomic with consistent types. This structure makes the dataset suitable for statistical and time-series analysis of Bitcoin’s short-term market behavior, price volatility, and trading volume.
df2 = pd.read_csv("data/00-raw/BTC_hourly.csv")
df2.head(4)
| unix | date | symbol | open | high | low | close | Volume BTC | Volume USD | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1762815600000 | 2025-11-10 23:00:00 | BTC/USD | 105992.49 | 106261.48 | 105867.10 | 105957.91 | 6.502861 | 6.890296e+05 |
| 1 | 1762812000000 | 2025-11-10 22:00:00 | BTC/USD | 105570.78 | 106106.60 | 105348.03 | 105992.49 | 5.499902 | 5.829483e+05 |
| 2 | 1762808400000 | 2025-11-10 21:00:00 | BTC/USD | 105995.16 | 105995.16 | 105251.73 | 105570.78 | 19.287521 | 2.036199e+06 |
| 3 | 1762804800000 | 2025-11-10 20:00:00 | BTC/USD | 105769.15 | 106260.00 | 105769.15 | 105995.16 | 27.902652 | 2.957546e+06 |
# Convert the 'date' column to datetime
df2["datetime"] = pd.to_datetime(df2["date"])
df2 = df2.drop(columns="date")
# Term 1: 1/1/2017 - 12/31/2020
crypto_hourly_term1 = df2[(df2["datetime"] >= "2017-01-01") & (df2["datetime"] <= "2020-12-31")]
# Term 2: 1/1/2025 - current
crypto_hourly_term2 = df2[df2["datetime"] >= "2025-01-01"]
crypto_hourly_term1.head(5)
| unix | symbol | open | high | low | close | Volume BTC | Volume USD | datetime | |
|---|---|---|---|---|---|---|---|---|---|
| 42623 | 1609372800000 | BTC/USD | 28898.55 | 29316.49 | 28889.66 | 29096.60 | 149.898636 | 4.361541e+06 | 2020-12-31 00:00:00 |
| 42624 | 1609369200000 | BTC/USD | 28713.18 | 28930.98 | 28661.93 | 28898.55 | 49.153198 | 1.420456e+06 | 2020-12-30 23:00:00 |
| 42625 | 1609365600000 | BTC/USD | 28916.95 | 28943.74 | 28606.89 | 28713.18 | 94.474535 | 2.712664e+06 | 2020-12-30 22:00:00 |
| 42626 | 1609362000000 | BTC/USD | 28783.62 | 28998.00 | 28559.79 | 28916.95 | 273.429226 | 7.906739e+06 | 2020-12-30 21:00:00 |
| 42627 | 1609358400000 | BTC/USD | 28788.87 | 28998.00 | 28637.55 | 28783.62 | 197.972330 | 5.698360e+06 | 2020-12-30 20:00:00 |
# Plot Term 1
plt.figure(figsize=(12, 5))
plt.plot(crypto_hourly_term1["datetime"], crypto_hourly_term1["close"], color="blue")
plt.title("BTC Hourly Close Prices: Term 1 (2017-2020)")
plt.xlabel("Date")
plt.ylabel("Close Price (USD)")
plt.grid(True)
plt.show()
# Plot Term 2
plt.figure(figsize=(12, 5))
plt.plot(crypto_hourly_term2["datetime"], crypto_hourly_term2["close"], color="green")
plt.title("BTC Hourly Close Prices: Term 2 (2025-current)")
plt.xlabel("Date")
plt.ylabel("Close Price (USD)")
plt.grid(True)
plt.show()
print("Term1:")
missing_volume(crypto_hourly_term1)
print("Term2:")
missing_volume(crypto_hourly_term2)
Term1:
Missing BTC volume: 204 out of 35038 (0.58%)
Missing USD volume: 204 out of 35038 (0.58%)
Term2:
Missing BTC volume: 3 out of 7536 (0.04%)
Missing USD volume: 3 out of 7536 (0.04%)
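missing_volume comes from our data_cleanup module. As a rough sketch of what such a helper might do, assuming "missing" means zero or NaN volume entries (the real implementation may differ):

def missing_volume_sketch(df):
    # Hypothetical stand-in for modules/data_cleanup.missing_volume, shown for
    # illustration only: count zero or NaN volume entries and report the share.
    n = len(df)
    for col, label in [("Volume BTC", "BTC"), ("Volume USD", "USD")]:
        missing = (df[col].isna() | (df[col] == 0)).sum()
        print(f"Missing {label} volume: {missing} out of {n} ({missing / n:.2%})")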
Dataset #3 | Trump Social Media (X and Truth) Posts¶
The dataset contains public posts made by Donald Trump on X (formerly Twitter) and Truth Social. Each entry includes a timestamp (both in Eastern Time and in Unix epoch), the platform, the full post text, and a link. From these, we can derive further metrics during EDA. The timestamp is the most important field because the Unix epoch format allows us to cleanly join this dataset with our financial datasets, which also rely on time-based records.
This dataset is relevant because it allows us to compare the content of Trump’s posts, specifically, the presence of certain keywords, with movements in financial markets. By aligning each post’s epoch timestamp with hourly or daily market data, we can test whether posts containing particular terms coincide with shifts in market prices.
There are a few limitations to this dataset. The long gap in X posts during Trump’s suspension (January 8, 2021 to November 19, 2022) creates a missing period unrelated to real posting behavior. Since we base our observations on the post text (description), posts that do not contain any text, such as image- or video-only posts, cannot be analyzed easily.
Overall, the dataset is structured, consistent, and straightforward to merge with our financial data, making it well-suited for evaluating whether specific types of posts or keywords align with market reactions.
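To illustrate the kind of time-based join the epoch timestamp enables, here is a minimal sketch that floors each post to the hour and aligns it with the BTC hourly bars. The column names follow the raw files described above, but the merge itself is only illustrative; the actual pairing happens later in our analysis modules.

import pandas as pd

# Illustrative alignment of posts with hourly BTC bars via the epoch timestamp.
posts_demo = pd.read_csv("data/00-raw/trump_social_posts_2016_to_now.csv")
btc_demo = pd.read_csv("data/00-raw/BTC_hourly.csv")
posts_demo["hour_utc"] = pd.to_datetime(posts_demo["timestamp_epoch"], unit="s", utc=True).dt.floor("h")
btc_demo["hour_utc"] = pd.to_datetime(btc_demo["date"], utc=True).dt.floor("h")
aligned_demo = posts_demo.merge(btc_demo[["hour_utc", "close"]], on="hour_utc", how="left")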
original_posts = pd.read_csv("data/00-raw/trump_social_posts_2016_to_now.csv")
original_posts.head()
| platform | timestamp_et | timestamp_epoch | link | link_type | description | |
|---|---|---|---|---|---|---|
| 0 | 2025-11-09T17:51:18-05:00 | 1762728678 | https://truthsocial.com/@realDonaldTrump/posts... | post | RT @ NewtGingrich The New York Post report on ... | |
| 1 | 2025-11-09T16:53:22-05:00 | 1762725202 | https://truthsocial.com/@realDonaldTrump/posts... | post | NaN | |
| 2 | 2025-11-09T16:38:31-05:00 | 1762724311 | https://truthsocial.com/@realDonaldTrump/posts... | post | https:// nypost.com/2025/10/24/us-news/ kash-p... | |
| 3 | 2025-11-09T16:27:19-05:00 | 1762723639 | https://truthsocial.com/@realDonaldTrump/posts... | post | https:// thefederalist.com/2025/10/23/l indsey... | |
| 4 | 2025-11-09T16:21:39-05:00 | 1762723299 | https://truthsocial.com/@realDonaldTrump/posts... | post | DHS sees biggest jump in public approval among... |
original_posts['platform'].value_counts()
platform
twitter    61995
Name: count, dtype: int64
posts_raw = pd.read_csv("./data/01-interim/posts.csv")
posts_raw.head()
| platform | timestamp_et | timestamp_epoch | link | link_type | description | platform_ignore | |
|---|---|---|---|---|---|---|---|
| 0 | truth | 2025-11-09T17:51:18-05:00 | 1762728678 | https://truthsocial.com/@realDonaldTrump/posts... | post | RT @ NewtGingrich The New York Post report on ... | |
| 1 | truth | 2025-11-09T16:53:22-05:00 | 1762725202 | https://truthsocial.com/@realDonaldTrump/posts... | post | NaN | |
| 2 | truth | 2025-11-09T16:38:31-05:00 | 1762724311 | https://truthsocial.com/@realDonaldTrump/posts... | post | https:// nypost.com/2025/10/24/us-news/ kash-p... | |
| 3 | truth | 2025-11-09T16:27:19-05:00 | 1762723639 | https://truthsocial.com/@realDonaldTrump/posts... | post | https:// thefederalist.com/2025/10/23/l indsey... | |
| 4 | truth | 2025-11-09T16:21:39-05:00 | 1762723299 | https://truthsocial.com/@realDonaldTrump/posts... | post | DHS sees biggest jump in public approval among... |
platformtypes = posts_raw['platform_ignore'].value_counts()
linktypes = posts_raw['link_type'].value_counts()
print(platformtypes, linktypes)
platform_ignore
twitter    61995
Name: count, dtype: int64
link_type
post    61995
Name: count, dtype: int64
posts_clean = pd.read_csv("./data/02-processed/posts.csv")
posts_clean = posts_clean.drop(columns = "link_type")
posts_clean = posts_clean.drop(columns = "platform_ignore")
posts_clean.head()
| platform | timestamp_et | timestamp_epoch | link | description | |
|---|---|---|---|---|---|
| 0 | truth | 2025-11-09T17:51:18-05:00 | 1762728678 | https://truthsocial.com/@realDonaldTrump/posts... | RT @ NewtGingrich The New York Post report on ... |
| 1 | truth | 2025-11-09T16:53:22-05:00 | 1762725202 | https://truthsocial.com/@realDonaldTrump/posts... | NaN |
| 2 | truth | 2025-11-09T16:38:31-05:00 | 1762724311 | https://truthsocial.com/@realDonaldTrump/posts... | https:// nypost.com/2025/10/24/us-news/ kash-p... |
| 3 | truth | 2025-11-09T16:27:19-05:00 | 1762723639 | https://truthsocial.com/@realDonaldTrump/posts... | https:// thefederalist.com/2025/10/23/l indsey... |
| 4 | truth | 2025-11-09T16:21:39-05:00 | 1762723299 | https://truthsocial.com/@realDonaldTrump/posts... | DHS sees biggest jump in public approval among... |
nan_counts = posts_clean.isna().sum().to_frame(name='NaN_count')
nan_counts
| NaN_count | |
|---|---|
| platform | 0 |
| timestamp_et | 0 |
| timestamp_epoch | 0 |
| link | 0 |
| description | 5195 |
nan_description = posts_clean[posts_clean['description'].isna()].copy()
posts_clean['description'] = posts_clean['description'].fillna("").astype(str)
ts_utc = pd.to_datetime(posts_clean['timestamp_et'], utc=True)
posts_clean['timestamp_et'] = ts_utc.dt.tz_convert('America/New_York')
posts_clean['date'] = posts_clean['timestamp_et'].dt.date
posts_clean['hour'] = posts_clean['timestamp_et'].dt.hour
posts_by_date = posts_clean.groupby('date').size().to_frame('post_count')
posts_by_hour = posts_clean.groupby('hour').size().to_frame('post_count')
#graph posts per hour of the day
posts_by_hour.plot(kind='bar', figsize=(8,4), title='Posts by Hour (ET)')
plt.xlabel('Hour of Day')
plt.ylabel('Count')
plt.show()
if not pd.api.types.is_datetime64_any_dtype(posts_clean['timestamp_et']):
posts_clean['timestamp_et'] = pd.to_datetime(posts_clean['timestamp_et'], utc=True)
weekly_counts = posts_clean.set_index('timestamp_et').resample('W').size()
rolling_mean = weekly_counts.rolling(window=4).mean()
plt.figure(figsize=(12, 6))
plt.plot(rolling_mean.index, rolling_mean, label='4-Week Moving Average', color='tab:blue', linewidth=2)
plt.plot(weekly_counts.index, weekly_counts, label='Raw Weekly Counts', color='gray', alpha=0.3)
plt.title('Weekly Moving Average of Trump Social Media Posts')
plt.xlabel('Date')
plt.ylabel('Number of Posts')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
df = pd.read_csv("./data/01-interim/posts.csv")
# Parse as timezone-aware UTC datetimes
df["timestamp_et"] = pd.to_datetime(df["timestamp_et"], utc=True, errors="coerce")
# Define date range as UTC-aware timestamps
start = pd.Timestamp("2021-06-01", tz="UTC")
end = pd.Timestamp("2022-02-01", tz="UTC") # end is exclusive
# Filter rows in the range [start, end)
mask = (df["timestamp_et"] >= start) & (df["timestamp_et"] < end)
subset = df[mask]
print("Number of posts between June 2021 and Jan 2022:", len(subset))
Number of posts between June 2021 and Jan 2022: 0
weekly_platform = (posts_clean.groupby([pd.Grouper(key='timestamp_et', freq='W-SUN'), 'platform']).size().unstack(fill_value=0).reindex(columns=['truth', 'twitter'], fill_value=0))
weekly_platform['combined'] = weekly_platform.sum(axis=1)
ax = weekly_platform[['truth', 'twitter', 'combined']].plot(kind='line', figsize=(10, 5), title='Weekly Posts: Truth vs X (twitter) vs Combined')
ax.set_xlabel('Week Ending')
ax.set_ylabel('Posts')
plt.tight_layout()
plt.show()
posts = posts_clean.copy()
posts.head()
| platform | timestamp_et | timestamp_epoch | link | description | date | hour | |
|---|---|---|---|---|---|---|---|
| 0 | truth | 2025-11-09 17:51:18-05:00 | 1762728678 | https://truthsocial.com/@realDonaldTrump/posts... | RT @ NewtGingrich The New York Post report on ... | 2025-11-09 | 17 |
| 1 | truth | 2025-11-09 16:53:22-05:00 | 1762725202 | https://truthsocial.com/@realDonaldTrump/posts... | 2025-11-09 | 16 | |
| 2 | truth | 2025-11-09 16:38:31-05:00 | 1762724311 | https://truthsocial.com/@realDonaldTrump/posts... | https:// nypost.com/2025/10/24/us-news/ kash-p... | 2025-11-09 | 16 |
| 3 | truth | 2025-11-09 16:27:19-05:00 | 1762723639 | https://truthsocial.com/@realDonaldTrump/posts... | https:// thefederalist.com/2025/10/23/l indsey... | 2025-11-09 | 16 |
| 4 | truth | 2025-11-09 16:21:39-05:00 | 1762723299 | https://truthsocial.com/@realDonaldTrump/posts... | DHS sees biggest jump in public approval among... | 2025-11-09 | 16 |
An interesting observation is that there was a period with no posts on Twitter or Truth Social. After a quick Google search, this appears to be the result of his Twitter account suspension, which lasted from January 8, 2021 to November 19, 2022. Trump also created his own social media platform, Truth Social, which contains the majority of his posts since 2022.
Dataset #4 - S&P500, Dow Jones, NASDAQ Daily Values¶
This dataset contains daily historical market data for three major U.S. stock market indices represented by exchange-traded funds (ETFs): the S&P 500 (SPX), the Nasdaq Composite (COMP), and the Dow Jones Industrial Average (DJIA). Each record includes the financial metrics open, high, low, and close prices (in U.S. dollars per share). The high and low indicate the day’s trading range, and their difference offers insight into the volatility of the ETF, reflecting investor uncertainty that could be related to outside influences such as social media posts. The close price is the end-of-day market price of the ETF, a metric that can be used to analyze general trends or sudden bursts within the market over short or long periods.
There are several limitations and considerations for this dataset. Because the data cover only the regular trading session (9:30 am-4:00 pm ET), events that occur outside that window, such as after-hours social media posts, are not reflected until the next session. Additionally, global, political, or macroeconomic events can cause abrupt changes in values that cannot be explained through the numerical data without additional context. Finally, ETFs are traded by public investors and are heavily influenced by sentiment, speculation, and other personal concerns that also cannot be explained by purely numerical data.
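The abstract mentions Parkinson’s range-based estimator for equity volatility; a minimal sketch of that estimator built from daily highs and lows is shown below. Our actual implementation lives in modules/ETF_volatility_functions.py and may differ in details, so treat this as illustrative.

import numpy as np

def parkinson_volatility_sketch(high, low):
    # Parkinson (1980) range-based estimate of daily volatility:
    # sigma = sqrt( ln(High/Low)^2 / (4 * ln 2) )
    log_range = np.log(high / low)
    return np.sqrt(log_range ** 2 / (4 * np.log(2)))

# e.g. (after df4 is built below): parkinson_volatility_sketch(df4["SPX_High"], df4["SPX_Low"])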
SPX = pd.read_csv('data/00-raw/S&P500 Data - S&P.csv')
COMP = pd.read_csv('data/00-raw/NASDAQ Data - NASDAQ.csv')
DJIA = pd.read_csv('data/00-raw/DOW - DOW.csv')
#updating column names to specify ETF
SPX.columns = [col if col == 'Date' else 'SPX_' + col for col in SPX.columns]
COMP.columns = [col if col == 'Date' else 'COMP_' + col for col in COMP.columns]
DJIA.columns = [col if col == 'Date' else 'DJIA_' + col for col in DJIA.columns]
#merging seperate datasets into one
df4 = SPX.merge(COMP, on='Date', how='outer').merge(DJIA, on='Date', how='outer')
df4['Date'] = pd.to_datetime(df4['Date'], format = '%m/%d/%Y')
#sorting by date
df4 = df4.sort_values(by='Date')
for col in df4.columns:
if col != 'Date':
df4[col] = df4[col].astype(str).str.replace(',', '', regex=False).astype(float)
df4.head()
| Date | SPX_Open | SPX_High | SPX_Low | SPX_Close | COMP_Open | COMP_High | COMP_Low | COMP_Close | DJIA_Open | DJIA_High | DJIA_Low | DJIA_Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 2016-01-04 | 2038.20 | 2038.20 | 1989.68 | 2012.66 | 4897.65 | 4903.09 | 4846.98 | 4903.09 | 17405.48 | 17405.48 | 16957.63 | 17148.94 |
| 21 | 2016-01-05 | 2013.78 | 2021.94 | 2004.17 | 2016.71 | 4917.84 | 4926.73 | 4872.74 | 4891.43 | 17147.50 | 17195.84 | 17038.61 | 17158.66 |
| 28 | 2016-01-06 | 2011.71 | 2011.71 | 1979.05 | 1990.26 | 4813.76 | 4866.04 | 4804.69 | 4835.76 | 17154.83 | 17154.83 | 16817.62 | 16906.51 |
| 35 | 2016-01-07 | 1985.32 | 1985.32 | 1938.83 | 1943.09 | 4736.40 | 4788.02 | 4688.17 | 4689.43 | 16888.36 | 16888.36 | 16463.63 | 16514.10 |
| 41 | 2016-01-08 | 1945.97 | 1960.40 | 1918.46 | 1922.03 | 4722.02 | 4742.57 | 4637.85 | 4643.63 | 16519.17 | 16651.89 | 16314.57 | 16346.45 |
# Term 1: 1/1/2017 - 12/31/2020
term1_index = df4[(df4["Date"] >= datetime.strptime("2017-01-01", '%Y-%m-%d')) & (df4["Date"] <= datetime.strptime("2020-12-31", '%Y-%m-%d'))]
# Term 2: 1/20/2025 - current (second term)
term2_index = df4[df4["Date"] >= datetime.strptime("2025-01-20", '%Y-%m-%d')]
term1_index.head(5)
| Date | SPX_Open | SPX_High | SPX_Low | SPX_Close | COMP_Open | COMP_High | COMP_Low | COMP_Close | DJIA_Open | DJIA_High | DJIA_Low | DJIA_Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 2017-01-03 | 2251.57 | 2263.88 | 2245.13 | 2257.83 | 5425.62 | 5452.57 | 5397.99 | 5429.08 | 19872.86 | 19938.53 | 19775.93 | 19881.76 |
| 14 | 2017-01-04 | 2261.60 | 2272.82 | 2261.60 | 2270.75 | 5440.91 | 5482.35 | 5440.24 | 5477.00 | 19890.94 | 19956.14 | 19878.83 | 19942.16 |
| 22 | 2017-01-05 | 2268.18 | 2271.50 | 2260.45 | 2269.00 | 5474.39 | 5495.86 | 5464.36 | 5487.94 | 19924.56 | 19948.60 | 19811.12 | 19899.29 |
| 29 | 2017-01-06 | 2271.14 | 2282.10 | 2264.06 | 2276.98 | 5499.08 | 5536.52 | 5482.81 | 5521.06 | 19906.96 | 19999.63 | 19834.08 | 19963.80 |
| 48 | 2017-01-09 | 2273.59 | 2275.49 | 2268.90 | 2268.90 | 5527.58 | 5541.08 | 5517.14 | 5531.82 | 19931.41 | 19943.78 | 19887.38 | 19887.38 |
Tweet/Truth Keyword Filtering and Sentiment Analysis¶
First, we will make the description easier to analyze by converting all characters to lower case and removing URLs, which will make keyword matching and sentiment analysis work better.
posts["posts_lc"] = posts["description"].str.lower()
posts["posts_lc"] = (posts["posts_lc"]
.str.replace(r"http\S+", " ", regex=True)
.str.replace(r"\s+", " ", regex=True)
.str.strip()
)
posts[["description", "posts_lc"]].head()
| description | posts_lc | |
|---|---|---|
| 0 | RT @ NewtGingrich The New York Post report on ... | rt @ newtgingrich the new york post report on ... |
| 1 | ||
| 2 | https:// nypost.com/2025/10/24/us-news/ kash-p... | nypost.com/2025/10/24/us-news/ kash-patel-skew... |
| 3 | https:// thefederalist.com/2025/10/23/l indsey... | thefederalist.com/2025/10/23/l indsey-halligan... |
| 4 | DHS sees biggest jump in public approval among... | dhs sees biggest jump in public approval among... |
Now, in order to examine our hypothesis, we must identify which posts use vocabulary that is relevant to the economy and that we believe could make the market act more volatile. The keywords we want to examine are placed into categories, and then we count keyword matches per post.
econ_keywords = {
"tariff": ["tariff", "tariffs"],
"rate": ["rate", "rates", "interest rate", "hike", "cut"],
"trade": ["trade", "china", "deal", "exports", "imports"],
"jobs": ["jobs", "employment", "unemployment"],
"inflation": ["inflation", "cpi", "prices", "cost of living"],
"fed": ["fed", "federal reserve", "powell"],
"market": ["market", "stock", "stocks", "dow", "nasdaq", "sp500", "s&p"],
}
for cat, words in econ_keywords.items():
pattern = r"\b(?:" + "|".join(re.escape(w) for w in words) + r")\b" # non-capturing group
posts[f"kw_{cat}"] = posts["posts_lc"].str.contains(pattern, regex=True)
kw_cols = [c for c in posts.columns if c.startswith("kw_")]
posts["contains_econ_keyword"] = posts[kw_cols].any(axis=1).astype(int)
posts.head(5)
print(posts.head(5))
platform timestamp_et timestamp_epoch \
0 truth 2025-11-09 17:51:18-05:00 1762728678
1 truth 2025-11-09 16:53:22-05:00 1762725202
2 truth 2025-11-09 16:38:31-05:00 1762724311
3 truth 2025-11-09 16:27:19-05:00 1762723639
4 truth 2025-11-09 16:21:39-05:00 1762723299
link \
0 https://truthsocial.com/@realDonaldTrump/posts...
1 https://truthsocial.com/@realDonaldTrump/posts...
2 https://truthsocial.com/@realDonaldTrump/posts...
3 https://truthsocial.com/@realDonaldTrump/posts...
4 https://truthsocial.com/@realDonaldTrump/posts...
description date hour \
0 RT @ NewtGingrich The New York Post report on ... 2025-11-09 17
1 2025-11-09 16
2 https:// nypost.com/2025/10/24/us-news/ kash-p... 2025-11-09 16
3 https:// thefederalist.com/2025/10/23/l indsey... 2025-11-09 16
4 DHS sees biggest jump in public approval among... 2025-11-09 16
posts_lc kw_tariff kw_rate \
0 rt @ newtgingrich the new york post report on ... False False
1 False False
2 nypost.com/2025/10/24/us-news/ kash-patel-skew... False False
3 thefederalist.com/2025/10/23/l indsey-halligan... False False
4 dhs sees biggest jump in public approval among... False False
kw_trade kw_jobs kw_inflation kw_fed kw_market contains_econ_keyword
0 False False False False False 0
1 False False False False False 0
2 False False False False False 0
3 False False False False False 0
4 False False False False False 0
It would also be helpful to determine the tone of each post. Since tone is a more qualitative value, we will use the VADER sentiment library to obtain a sentiment score. Using this score we can label each post as positive, neutral, or negative.
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()
posts["sentiment_compound"] = posts["posts_lc"].apply(
lambda x: sia.polarity_scores(x)["compound"]
)
posts["sentiment_label"] = posts["sentiment_compound"].apply(classify_sentiment)
posts[["posts_lc", "sentiment_compound", "sentiment_label"]].head(5)
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kento/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
| posts_lc | sentiment_compound | sentiment_label | |
|---|---|---|---|
| 0 | rt @ newtgingrich the new york post report on ... | 0.9953 | positive |
| 1 | 0.0000 | neutral | |
| 2 | nypost.com/2025/10/24/us-news/ kash-patel-skew... | 0.0000 | neutral |
| 3 | thefederalist.com/2025/10/23/l indsey-halligan... | 0.0000 | neutral |
| 4 | dhs sees biggest jump in public approval among... | 0.4767 | positive |
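classify_sentiment is defined in our data_cleanup module; a plausible sketch, assuming the commonly used VADER compound cutoffs of ±0.05, is:

def classify_sentiment_sketch(compound):
    # Hypothetical stand-in for modules/data_cleanup.classify_sentiment,
    # using the conventional VADER thresholds of +/-0.05 on the compound score.
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"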
Bitcoin Hourly Data Analysis¶
The primary goal of this analysis is to visually detect deviations from the expected market behavior—the "abnormal returns" and "abnormal volatility" following social media post events.
In an ideal scenario, the control group's average price change and volatility lines would hover close to zero throughout the 24-hour window, and the $95\%$ confidence band would consistently encompass the zero line.
First, let's do one last round of cleanup on the post data.
btc_term1 = pd.read_csv("data/02-processed/btc_hourly_term1.csv")
posts = pd.read_csv("data/02-processed/posts_analyzed.csv")
posts_term1 = posts[(posts['date'] >= '2017-01-01') & (posts['date'] < '2021-01-01')].copy()
posts_term1.tail(2)
tariff_tweets = posts_term1[posts_term1['kw_tariff'] == True].copy()
tariff_tweets['date'] = pd.to_datetime(tariff_tweets['date'])
btc_term1['datetime'] = pd.to_datetime(btc_term1['datetime']).dt.tz_localize('UTC')
tariff_tweets.head(2)
| platform | timestamp_et | timestamp_epoch | link | description | date | hour | posts_lc | kw_tariff | kw_rate | kw_trade | kw_jobs | kw_inflation | kw_fed | kw_market | contains_econ_keyword | sentiment_compound | sentiment_label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32124 | 2020-11-21 23:54:18-05:00 | 1606020858 | https://x.com/realdonaldtrump/status/133037402... | Thanks Mark. It's all a continuation of the ne... | 2020-11-21 | 23 | thanks mark. it's all a continuation of the ne... | True | False | False | False | False | False | False | 1 | -0.3711 | negative | |
| 32134 | 2020-11-21 23:04:31-05:00 | 1606017871 | https://x.com/realdonaldtrump/status/133036149... | Thanks Mark. It's all a continuation of the ne... | 2020-11-21 | 23 | thanks mark. it's all a continuation of the ne... | True | False | False | False | False | False | False | 1 | -0.3711 | negative |
Let's just double check that we can access data.
btc_term1_indexed = btc_term1.set_index('datetime').sort_index()
first_timestamp = pd.to_datetime(tariff_tweets['timestamp_et'], utc=True).to_list()[0]
first_tweet_hour = first_timestamp.floor('h')
btc_price = btc_term1_indexed.loc[first_tweet_hour, 'close']
print(f"First tariff tweet timestamp (Full): {first_timestamp}")
print(f"Hour start for lookup: {first_tweet_hour}")
print(f"BTC Close Price at that hour: {btc_price}")
First tariff tweet timestamp (Full): 2020-11-22 04:54:18+00:00
Hour start for lookup: 2020-11-22 04:00:00+00:00
BTC Close Price at that hour: 18605.81
Great, now let's create a dataframe that includes what we plan to use.
tariff_tweets['tweet_datetime_full'] = pd.to_datetime(tariff_tweets['timestamp_et'], utc=True)
# 2. Round down all tweet times to the nearest hour (t=0)
tweet_hours = tariff_tweets['tweet_datetime_full'].dt.floor('h')
# 3. Extract the BTC closing price for each rounded hour
# We use the full Series of UTC hours to lookup prices in the correctly timezone-aware btc_term1_indexed.
btc_prices_at_tweet_hour = btc_term1_indexed.loc[tweet_hours, 'close'].reset_index(drop=True)
# 4. Create the final DataFrame
hourly_prices_df = pd.DataFrame({
'tweet_datetime_utc': tariff_tweets['tweet_datetime_full'].reset_index(drop=True),
'tweet_hour_start_utc': tweet_hours.reset_index(drop=True),
'btc_close_price_t0': btc_prices_at_tweet_hour
})
hourly_prices_df.head()
| tweet_datetime_utc | tweet_hour_start_utc | btc_close_price_t0 | |
|---|---|---|---|
| 0 | 2020-11-22 04:54:18+00:00 | 2020-11-22 04:00:00+00:00 | 18605.81 |
| 1 | 2020-11-22 04:04:31+00:00 | 2020-11-22 04:00:00+00:00 | 18605.81 |
| 2 | 2020-11-02 19:29:20+00:00 | 2020-11-02 19:00:00+00:00 | 13576.79 |
| 3 | 2020-10-01 02:47:30+00:00 | 2020-10-01 02:00:00+00:00 | 10821.83 |
| 4 | 2020-09-11 02:15:09+00:00 | 2020-09-11 02:00:00+00:00 | 10268.87 |
Now that we have the close price at the time of the tweet, let's also get the hours around when the tweet was sent so that we can create some price-over-time graphs.
price_columns = [str(i) for i in range(-24, 25)]
HOURS_TO_EXTRACT = 49
tweet_timeline_data = []
reference_column = '0'
if 'tweet_datetime_full' not in tariff_tweets.columns:
tariff_tweets['tweet_datetime_full'] = pd.to_datetime(tariff_tweets['timestamp_et'], utc=True)
for _, tweet in tariff_tweets.iterrows():
tweet_hour_start = tweet['tweet_datetime_full'].floor('h')
series_start_time = tweet_hour_start - pd.Timedelta(hours=24)
start_pos = btc_term1_indexed.index.get_loc(series_start_time)
# Exclude posts whose window would lie outside of the term.
if start_pos + HOURS_TO_EXTRACT <= len(btc_term1_indexed):
price_series = btc_term1_indexed['close'].iloc[start_pos : start_pos + HOURS_TO_EXTRACT]
row_data = {
'link': tweet['link'],
'tweet_datetime_utc': tweet['tweet_datetime_full']
}
row_data.update(dict(zip(price_columns, price_series.values)))
tweet_timeline_data.append(row_data)
# 3. CREATE DataFrame: tweet_timeline_term1
final_columns = ['link', 'tweet_datetime_utc'] + price_columns
tweet_timeline_term1 = pd.DataFrame(tweet_timeline_data, columns=final_columns)
tweet_timeline_term1[price_columns] = tweet_timeline_term1[price_columns].apply(pd.to_numeric, errors='coerce')
price_t0 = tweet_timeline_term1[[reference_column]].values
price_data = tweet_timeline_term1[price_columns].values
percentage_change_data = ((price_data / price_t0) - 1) * 100
tweet_timeline_term1[price_columns] = percentage_change_data
tweet_timeline_term1[reference_column] = 0.0
new_column_names = {col: f'pct_change_{col}' for col in price_columns}
tweet_timeline_term1.rename(columns=new_column_names, inplace=True)
tweet_timeline_term1.head()
| link | tweet_datetime_utc | pct_change_-24 | pct_change_-23 | pct_change_-22 | pct_change_-21 | pct_change_-20 | pct_change_-19 | pct_change_-18 | pct_change_-17 | ... | pct_change_15 | pct_change_16 | pct_change_17 | pct_change_18 | pct_change_19 | pct_change_20 | pct_change_21 | pct_change_22 | pct_change_23 | pct_change_24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://x.com/realdonaldtrump/status/133037402... | 2020-11-22 04:54:18+00:00 | -0.018650 | 0.391759 | 0.240946 | 0.445130 | 1.029195 | 0.125821 | 0.056058 | -0.293295 | ... | -0.054123 | -0.157639 | -0.037623 | 0.247826 | -0.843285 | -2.554525 | -1.985724 | -2.057476 | -1.144212 | -0.669522 |
| 1 | https://x.com/realdonaldtrump/status/133036149... | 2020-11-22 04:04:31+00:00 | -0.018650 | 0.391759 | 0.240946 | 0.445130 | 1.029195 | 0.125821 | 0.056058 | -0.293295 | ... | -0.054123 | -0.157639 | -0.037623 | 0.247826 | -0.843285 | -2.554525 | -1.985724 | -2.057476 | -1.144212 | -0.669522 |
| 2 | https://x.com/realdonaldtrump/status/132334647... | 2020-11-02 19:29:20+00:00 | 1.653631 | 1.904132 | 1.783706 | 0.959947 | 1.472292 | 1.816851 | 1.354370 | 1.203156 | ... | -0.117775 | -0.275102 | -0.335131 | 0.949488 | 1.639047 | 1.126997 | 1.200873 | 1.272981 | 1.404308 | 1.202051 |
| 3 | https://x.com/realdonaldtrump/status/131149794... | 2020-10-01 02:47:30+00:00 | -0.402982 | -0.663659 | -0.707182 | -1.061928 | -0.957601 | -1.070614 | -1.012213 | -1.170504 | ... | -3.158616 | -2.158415 | -1.872234 | -1.938951 | -2.144000 | -1.948284 | -1.785558 | -2.025628 | -2.058802 | -2.019899 |
| 4 | https://x.com/realdonaldtrump/status/130424204... | 2020-09-11 02:15:09+00:00 | 1.028059 | 1.107230 | 1.221751 | 1.127485 | 0.756753 | 0.147923 | 0.115397 | 0.145293 | ... | 0.314640 | 0.518947 | 0.633857 | 0.606006 | 0.869326 | 1.089117 | 1.304330 | 0.823070 | 1.033512 | 0.820149 |
5 rows × 51 columns
Let's take the average of each column and graph our results.
pct_change_columns = [f'pct_change_{i}' for i in range(-24, 25)]
# 2. Calculate the average percentage change across all tweets for each hour
avg_change = tweet_timeline_term1[pct_change_columns].mean()
# 3. Wrangle the data into a clean DataFrame for plotting
plot_data = avg_change.reset_index()
plot_data.columns = ['Hour_String', 'Avg_Pct_Change']
plot_data['Hour'] = plot_data['Hour_String'].str.split('_').str[-1].astype(int)
plot_data = plot_data.sort_values(by='Hour')
# 4. Generate the plot (using a concise format as requested)
plt.figure(figsize=(10, 5))
plt.plot(plot_data['Hour'], plot_data['Avg_Pct_Change'], marker='.', linewidth=2, color='#F7931A') # BTC Orange
# Add formatting for better readability
plt.axhline(0, color='gray', linestyle='--', linewidth=0.8)
plt.axvline(0, color='red', linestyle='-', linewidth=1, label='Tweet Hour (t=0)')
plt.title('Average BTC Price Percentage Change Relative to Tariff-Related Tweets (term1)')
plt.xlabel('Hours Since Tweet')
plt.ylabel('Average Percentage Change (%)')
plt.grid(True, linestyle=':', alpha=0.6)
plt.xticks(range(-24, 25, 4))
plt.legend()
plt.show()
The plot above shows the average price change before and after a tariff-related tweet. We have generalized the above steps into a function in modules/hourly_data_functions.py.
plt_object, timeline_df = analyze_btc_impact_auto(
term="term2",
keyword_category="tariff",
time_range_hours=24
)
Analysis Summary: Time-Series Event Study¶
The next cells of code perform a Time-Series Event Study to measure the average short-term impact of specific keyword mentions on the price of Bitcoin (BTC) across two distinct time periods.
1. Data Calculation Methodology¶
The code iterates through all of the Keyword Categories (e.g., 'tariff', 'fed') and each Term ('term1': 2017-2020, 'term2': 2025-Present) to aggregate the price data.
- Event Data Alignment: The external function extracts the BTC hourly percentage price change for a window of $\pm 72$ hours around every keyword mention (event time $t=0$).
- Averaging (The Mean Line): The Average Percentage Change ($\text{Avg\_Pct\_Change}$) is calculated by taking the mean of the percentage price changes across all events in the sample for each specific hour in the $\pm 72$ window. $$ \overline{\Delta P_t} = \frac{1}{N} \sum_{i=1}^{N} \Delta P_{i,t} $$ where $N$ is the sample size, and $\Delta P_{i,t}$ is the price change for event $i$ at hour $t$.
- Volatility (The Shaded Area): The Standard Deviation ($\text{Std\_Pct\_Change}$) is calculated to measure the variability of price changes around the average effect at each hour.
2. Graph Visualization¶
A multi-panel grid is generated, where each subplot visualizes the aggregated effect for one Keyword Category during one Term.
| Element | Data Source | Purpose |
|---|---|---|
| Main Line Plot | `Avg_Pct_Change` | Shows the average price trajectory of BTC relative to the event time ($t=0$). |
| Shaded Area | `Avg_Pct_Change` $\pm$ `Std_Pct_Change` | Represents the $\pm 1$ Standard Deviation ($\pm 1$ SD), indicating the volatility (or uncertainty) of the price movement. |
| Vertical Red Line | $t=0$ | Marks the exact time the keyword event occurred. |
| Horizontal Gray Line | $0\%$ | Marks the zero percent price change baseline. |
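Before calling the module function, here is a condensed sketch of the per-hour aggregation each subplot performs, applied to the ±24-hour tariff timeline built above. The grid below extends the same logic to ±72 hours via modules/hourly_data_functions.py, so treat this as illustrative.

hours = list(range(-24, 25))
cols = [f"pct_change_{h}" for h in hours]
avg = tweet_timeline_term1[cols].mean()     # mean % change across events, per hour
std = tweet_timeline_term1[cols].std()      # +/- 1 SD band, per hour

plt.figure(figsize=(10, 5))
plt.plot(hours, avg.values, color="#F7931A", label="Mean % change")
plt.fill_between(hours, (avg - std).values, (avg + std).values, alpha=0.2, label="±1 SD")
plt.axvline(0, color="red", linewidth=1)
plt.axhline(0, color="gray", linestyle="--", linewidth=0.8)
plt.xlabel("Hours Since Tweet")
plt.ylabel("Percentage Change (%)")
plt.legend()
plt.show()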
run_full_btc_impact_grid(
analyze_func=analyze_btc_impact_auto,
terms=["term1", "term2"],
categories=["tariff", "rate", "trade", "jobs", "inflation", "fed", "market", "econ"],
time_range=72,
y_min=-2,
y_max=2
)
Improvements: Statistical Trend Analysis and Control Group¶
The following updated code significantly enhances the analysis by introducing a statistical framework and generating a control group for robust comparison.
1. Statistical Trend Analysis¶
The new function calculate_p_trend_shift formalizes the assessment of price momentum around the event time ($t=0$).
- Linear Regression: It fits a linear trend (a straight line) to the average price change data both in the pre-event period (Hour $-72$ to $-1$) and the post-event period (Hour $+1$ to $+72$).
- Significance Test: It performs a two-sample t-test comparing the slope of the pre-event trend ($\text{slope}_{\text{pre}}$) against the slope of the post-event trend ($\text{slope}_{\text{post}}$). This tests for a statistically significant change in momentum or price direction following the event.
- P-Value Output: The result is a p-value ($\text{p}_{\text{keyword}}$ or $\text{p}_{\text{control}}$) which represents the probability of observing such a difference in slopes under the null hypothesis that the pre-event and post-event trends are the same (i.e., no real change in price momentum occurred).
$$\text{T-Statistic} = \frac{\text{slope}_{\text{post}} - \text{slope}_{\text{pre}}}{\sqrt{\text{SE}_{\text{post}}^2 + \text{SE}_{\text{pre}}^2}}$$
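A minimal sketch of this slope-comparison test using scipy is shown below; calculate_p_trend_shift in our module may implement the details differently (hour ranges, error handling), so this is illustrative only.

from scipy import stats
import numpy as np

def trend_shift_p_sketch(hours, avg_change):
    # Fit separate linear trends to the pre-event (t < 0) and post-event (t > 0)
    # average price changes, then compare the slopes using their standard errors.
    hours = np.asarray(hours)
    avg_change = np.asarray(avg_change)
    pre = stats.linregress(hours[hours < 0], avg_change[hours < 0])
    post = stats.linregress(hours[hours > 0], avg_change[hours > 0])
    t_stat = (post.slope - pre.slope) / np.sqrt(post.stderr ** 2 + pre.stderr ** 2)
    p_value = 2 * stats.norm.sf(abs(t_stat))  # two-sided, normal approximation
    return t_stat, p_value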
2. Control Group Generation and Plotting¶
To validate the findings, the code now compares the keyword effects against a random baseline.
| New Feature | Function / Logic | Purpose |
|---|---|---|
| Control Group Generation | `generate_control_data` | Randomly selects $\mathbf{N=100}$ time points from the historical BTC data (`btc_hourly_{term}.csv`) to serve as "random events." This creates a baseline to establish typical market noise. |
| Control Plot (`fig_control`) | New dedicated plot | Visualizes the average BTC price change, $\pm 1$ SD, and trend lines for the random events. |
| Trend Line Visualization | Linear trend fit on $\text{Avg\_Pct\_Change}$ | Dashed black lines show the fitted trend before the event ($t=-72$ to $-1$), and solid black lines show the trend after the event ($t=+1$ to $+72$). |
| Statistical Annotation | $\mathbf{p}_{\text{keyword}}$ and $\mathbf{p}_{\text{control}}$ | The p-value from the trend shift test is displayed on the top right of each chart, quantitatively measuring if the average price experienced a statistically significant change in momentum around the event time. |
run_btc_analysis()
Interpretation of P-Values and Analysis Expansion¶
P-Value Interpretation¶
The statistical test currently performed is a two-sample t-test comparing the slope of the average price change before the event ($\text{slope}_{\text{pre}}$) versus after the event ($\text{slope}_{\text{post}}$).
The Null Hypothesis ($\text{H}_0$) for this test is that there is no difference between the two slopes: $\text{slope}_{\text{pre}} = \text{slope}_{\text{post}}$.
- Volatility and Noise: In volatile markets like Bitcoin, the price trajectory is inherently noisy and rarely linear, even when averaged. This high underlying volatility violates the test's assumption of a stable, linear trend.
- False Significance: Since the market fundamentally deviates from the test's linear assumption, the statistical test frequently finds even small, non-meaningful differences in slopes to be highly significant ($\text{p-value} \ll 0.05$). This is why the control group p-values ($\text{p}_{\text{control}}$) are also small, indicating a statistical artifact of market noise rather than true explanatory power.
Improved Null Hypothesis¶
To establish a more meaningful statistical baseline, this research could be expanded by the following:
- Expanded Null Hypothesis: Instead of comparing $\text{p}_{\text{keyword}}$ to a static $\alpha=0.05$ threshold, we would compare it to the distribution of p-values generated from hundreds or thousands of randomly generated control event sets.
- True Baseline: This expansion would establish a true empirical baseline for "random market momentum shifts" ($\text{p}_{\text{random}}$), allowing us to determine if the keyword events are statistically significant relative to typical market noise.
However, generating, processing, and calculating the trend-shift p-value for thousands of control datasets would be too computationally expensive and time-consuming to complete within this analysis environment, making the current single control group a necessary simplification.
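As a conceptual sketch of the expanded null described above (the function and variable names here are hypothetical), the empirical baseline would simply ask where the keyword p-value falls within the distribution of p-values produced by many random control event sets:

import numpy as np

def empirical_baseline_sketch(p_keyword, control_p_values):
    # control_p_values: trend-shift p-values from repeatedly re-running the test
    # on randomly drawn control event sets. The return value is the fraction of
    # random sets that look at least as "significant" as the keyword events.
    control_p_values = np.asarray(control_p_values)
    return (control_p_values <= p_keyword).mean()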
run_btc_volatility_analysis()
Bitcoin Volatility Analysis: Methodology¶
For each tweet event:
- Round the tweet timestamp to the nearest hour (since BTC data is hourly)
- Extract BTC prices from 12 hours before to 12 hours after the tweet
- This creates a 25-point price series: hours [-12, -11, ..., -1, 0, 1, ..., 11, 12]
The result is a DataFrame where:
- Each row = one tweet event
- Each column = the BTC price at a specific hour relative to the tweet
- Example: Column `-5` contains the BTC price 5 hours before each tweet
Volatility Calculation¶
We calculate volatility as the standard deviation of hourly percentage returns:
- Hourly Returns: For each event and each hour `t`, calculate:
  Return(t) = [(Price(t) - Price(t-1)) / Price(t-1)] × 100
  This gives the percentage change from the previous hour.
- Cross-Event Volatility: For each hour offset (e.g., hour 0, hour +3, etc.), calculate the standard deviation of returns across all events:
Volatility(hour) = std_dev(Returns at that hour across all events)
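A compact sketch of this cross-event volatility calculation is shown below; run_btc_volatility_analysis in modules/volatility_analysis.py performs the full version, including plotting and the control group, so the helper here is illustrative.

import pandas as pd

def cross_event_volatility_sketch(price_windows):
    # price_windows: DataFrame with one row per tweet event and one column per
    # hour offset (ordered -12 ... 12) holding the BTC price at that hour.
    prev = price_windows.shift(1, axis=1)                 # price one hour earlier
    hourly_returns = (price_windows / prev - 1) * 100     # percent return per hour
    return hourly_returns.std(axis=0)                     # std across events, per hour offset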
Control Group¶
- Select 100 random timestamps from the BTC dataset (avoiding edges)
- Apply the same window extraction and volatility calculation
- This shows typical volatility patterns during non-tweet periods
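The control selection can be sketched as follows, assuming the BTC data has already been indexed by hour and "avoiding edges" means leaving a full window on each side (the module's exact sampling may differ):

import numpy as np

def random_control_hours_sketch(btc_indexed, n=100, window_hours=12, seed=0):
    # Draw n random hourly timestamps, keeping a margin of window_hours on each
    # side so every control "event" has a complete +/- window available.
    rng = np.random.default_rng(seed)
    valid_positions = np.arange(window_hours, len(btc_indexed) - window_hours)
    picks = rng.choice(valid_positions, size=n, replace=False)
    return btc_indexed.index[picks]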
Analysis Results: No Significant Correlation Between Trump Tweets and Bitcoin Volatility¶
- No systematic volatility spikes at tweet time: Visual inspection of keyword-term combinations shows relatively flat volatility patterns around hour 0 (the tweet moment). There are no unusual patterns immediately following tweets.
Crypto Market Cap Daily Data Analysis¶
crypto_term1 = pd.read_csv('data/02-processed/crypto_term1.csv')
crypto_term2 = pd.read_csv('data/02-processed/crypto_term2.csv')
crypto_term1.head(3)
| snapped_at | market_cap | total_volume | datetime | |
|---|---|---|---|---|
| 0 | 1451606400000 | 7.124298e+09 | 1.505954e+09 | 2016-01-01 |
| 1 | 1451692800000 | 7.131191e+09 | 8.657891e+08 | 2016-01-02 |
| 2 | 1451779200000 | 7.080195e+09 | 1.076885e+09 | 2016-01-03 |
crypto_term2.head(3)
| snapped_at | market_cap | total_volume | datetime | |
|---|---|---|---|---|
| 0 | 1737331200000 | 3.621159e+12 | 3.592565e+11 | 2025-01-20 |
| 1 | 1737417600000 | 3.664661e+12 | 4.010621e+11 | 2025-01-21 |
| 2 | 1737504000000 | 3.792118e+12 | 2.651147e+11 | 2025-01-22 |
results = run_keyword_vol_analysis(
"data/02-processed/crypto_term1.csv",
"data/02-processed/crypto_term2.csv",
posts_path="data/02-processed/posts_analyzed.csv",
window=(-5,5)
)
Event-Window Analysis of Daily Crypto Volatility¶
We analyzed day-to-day crypto volatility for two datasets, term1 and term2, using standardized daily absolute returns (Close_Vol) as a proxy for volatility. Each day’s volatility was converted to a z-score (Close_Vol_z), so the plotted values represent the number of standard deviations away from the mean volatility. This allows comparison across time and between datasets.
What Close_Vol means:
Close_Vol measures the magnitude of day-to-day changes in cryptocurrency value, computed as the absolute log-return of either price or market capitalization:
$$\text{Close\_Vol}_t = |\log(\text{Value}_t / \text{Value}_{t-1})|$$
It captures how much the crypto value swung in a single day, regardless of direction. It does not measure trading volume or the amount sold, but instead reflects price or market-cap volatility.
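A minimal sketch of how Close_Vol and its z-score could be computed from the daily market-cap series is shown below; the project's version lives in modules/crypto_volatility_module.py and may differ in details such as the value column used.

import numpy as np

def add_close_vol_sketch(df, value_col="market_cap"):
    # Absolute daily log-return of the chosen value column, standardized to a
    # z-score over the whole sample.
    out = df.copy()
    out["Close_Vol"] = np.abs(np.log(out[value_col] / out[value_col].shift(1)))
    out["Close_Vol_z"] = (out["Close_Vol"] - out["Close_Vol"].mean()) / out["Close_Vol"].std()
    return out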
results = run_keyword_vol_analysis(
term1_path='data/02-processed/crypto_term1.csv',
term2_path='data/02-processed/crypto_term2.csv',
window=(-2,2),
plot=False
)
significance = test_post_event_volatility(results, pre_window=(-2,-1), post_window=(0,2))
for kw, stats in significance.items():
print(f"Keyword: {kw}")
for term, vals in stats.items():
print(f" {term.capitalize()}: t={vals['t_stat']:.2f}, p={vals['p_value']:.3f}")
Keyword: kw_tariff
  Term1: t=3.34, p=0.045
  Term2: t=1.89, p=0.156
Keyword: kw_rate
  Term1: t=0.70, p=0.558
  Term2: t=3.07, p=0.150
Keyword: kw_trade
  Term1: t=1.20, p=0.320
  Term2: t=0.21, p=0.849
Keyword: kw_jobs
  Term1: t=0.34, p=0.755
  Term2: t=1.70, p=0.190
Keyword: kw_inflation
  Term1: t=-0.41, p=0.727
  Term2: t=5.30, p=0.105
Keyword: kw_fed
  Term1: t=2.48, p=0.122
  Term2: t=5.10, p=0.015
Keyword: kw_market
  Term1: t=0.12, p=0.915
  Term2: t=2.40, p=0.105
Assessing Statistical Significance of Post-Event Volatility
We compared average z-scored volatility before and after keyword events, including the event day (t=0). The z-scores are standardized relative to the entire dataset, so they reflect how extreme a day’s volatility is compared to normal fluctuations:
- Pre-event: t = -2 to -1
- Post-event: t = 0 to 2
Two-sample t-tests were used to assess whether post-event volatility was higher.
- Null hypothesis (H₀): There is no difference in average volatility between the pre-event and post-event periods (i.e., the posts have no effect on market volatility).
- t-value: measures the difference in means relative to variability
- p-value: probability of observing a difference as extreme as (or more extreme than) the one observed, assuming the null hypothesis is true
The t-statistic is calculated as: $$t = \frac{\bar{X}_{\text{post}} - \bar{X}_{\text{pre}}}{\sqrt{\frac{s_{\text{post}}^2}{n_{\text{post}}} + \frac{s_{\text{pre}}^2}{n_{\text{pre}}}}}$$
Results:
- Most keywords: p > 0.1, no significant rise
- kw_tariff (term1): t = 3.34, p = 0.045, marginally significant (just below α = 0.05)
- kw_fed (term2): t = 5.10, p = 0.015, significant
- Others: not significant
Including t=0 captures volatility on the day of the post.
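As a hedged illustration of the mechanics, the comparison above can be run with Welch's two-sample t-test (scipy.stats.ttest_ind with equal_var=False), which matches the t-statistic formula shown; whether test_post_event_volatility uses the one-sided or two-sided variant is not shown here, so this is a sketch rather than the exact implementation.
from scipy import stats
def post_event_test(pre_vals, post_vals):
    """Welch's two-sample t-test on z-scored volatility before vs. after the event."""
    # equal_var=False gives the Welch statistic in the formula above;
    # alternative='greater' tests the one-sided hypothesis that post-event volatility is higher
    result = stats.ttest_ind(post_vals, pre_vals, equal_var=False, alternative='greater')
    return result.statistic, result.pvalue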
Analysis of Social Media Posts Data¶
df = pd.read_csv('./data/02-processed/posts_analyzed.csv')
df['timestamp_et'] = pd.to_datetime(df['timestamp_et'], utc=True, errors='coerce')
weekly_stats = df.set_index('timestamp_et').resample('W')['sentiment_compound'].agg(['mean', 'count'])
weekly_stats = weekly_stats[weekly_stats['count'] > 0]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
ax1.plot(weekly_stats.index, weekly_stats['mean'], label='Weekly Avg Sentiment', color='purple', alpha=0.5)
rolling_sentiment = weekly_stats['mean'].rolling(window=4).mean()
ax1.plot(rolling_sentiment.index, rolling_sentiment, label='4-Week Moving Average', color='darkorange', linewidth=2)
ax1.set_title('Weekly Average Sentiment of Trump Posts (Active Weeks Only)', fontsize=14)
ax1.set_xlabel('Date', fontsize=12)
ax1.set_ylabel('Compound Sentiment Score (-1 to 1)', fontsize=12)
ax1.axhline(0, color='black', linewidth=1, linestyle='-', alpha=0.5)
ax1.legend()
sns.histplot(df['sentiment_compound'], bins=30, ax=ax2, kde=True, color='purple')
ax2.set_title('Distribution of Individual Post Sentiment Scores', fontsize=14)
ax2.set_xlabel('Sentiment Compound Score (Negative < 0 < Positive)', fontsize=12)
ax2.set_ylabel('Frequency', fontsize=12)
plt.tight_layout()
plt.show()
These graphs analyze the emotional tone of the social media posts using the VADER compound sentiment score.
Weekly Average Sentiment: This line chart tracks the average sentiment of posts by week. Weeks with zero activity have been removed to maintain a focus on active periods. A 4-week moving average is included to highlight whether the tone trends more negative or positive over time.
The weekly average sentiment fluctuates significantly, often hovering above the neutral line. This means there is a generally positive tone in his posts on average. There are some dips below zero, indicating periods of negative sentiment.
Distribution of Sentiment Scores: This histogram displays the overall spread of sentiment for individual posts. It reveals whether the posts are generally neutral, highly positive, highly negative, or skewed in one direction.
This histogram has a peak around neutral (0.0), indicating that a large portion of the posts are classified as neutral: likely short updates, links, or retweets without strong emotional keywords. The opinionated content tends to be strongly positive rather than strongly negative.
print("-" * 80)
print("EXAMPLES OF TWEETS BY SENTIMENT SCORE")
print("-" * 80)
idx_pos = df['sentiment_compound'].idxmax()
idx_neg = df['sentiment_compound'].idxmin()
# 1. Highly Positive Example
print(f"\n[+] HIGHLY POSITIVE (Score: {df.loc[idx_pos, 'sentiment_compound']}):")
print(f"\"{df.loc[idx_pos, 'description']}\"")
# 2. Highly Negative Example
print(f"\n[-] HIGHLY NEGATIVE (Score: {df.loc[idx_neg, 'sentiment_compound']}):")
print(f"\"{df.loc[idx_neg, 'description']}\"")
print("-" * 80)
-------------------------------------------------------------------------------- EXAMPLES OF TWEETS BY SENTIMENT SCORE -------------------------------------------------------------------------------- [+] HIGHLY POSITIVE (Score: 0.9977): "I had a truly great meeting with President Xi of China. There is enormous respect between our two Countries, and that will only be enhanced with what just took place. We agreed on many things, with others, even of high importance, being very close to resolved. I was extremely honored by the fact that President Xi authorized China to begin the purchase of massive amounts of Soybeans, Sorghum, and other Farm products. Our Farmers will be very happy! In fact, as I said once before during my first Administration, Farmers should immediately go out and buy more land and larger tractors. I would like to thank President Xi for this! Additionally, China has agreed to continue the flow of Rare Earth, Critical Minerals, Magnets, etc., openly and freely. Very significantly, China has strongly stated that they will work diligently with us to stop the flow of Fentanyl into our Country. They will help us end the Fentanyl Crisis. China also agreed that they will begin the process of purchasing American Energy. In fact, a very large scale transaction may take place concerning the purchase of Oil and Gas from the Great State of Alaska. Chris Wright, Doug Burgum, and our respective Energy teams will be meeting to see if such an Energy Deal can be worked out. The agreements reached today will deliver Prosperity and Security to millions of Americans. After this Historic trip to Asia, I am now heading back to Washington, D.C. I want to thank the Great Countries of Malaysia, Japan, and South Korea for being so generous, gracious, and hospitable — Also, Australia, Canada, New Zealand, Singapore, Thailand, and Vietnam, who were at the Dinner last night hosted by His Excellency Lee Jae Myung. Hundreds of Billions of Dollars are being brought into our Country because of them. Our Nation is Strong, Respected, and Admired Again and, THE BEST IS YET TO COME!" [-] HIGHLY NEGATIVE (Score: -0.9976): "I have been briefed on the deadly shooting at the ICE Field Office in Dallas, Texas. It has now been revealed the deranged shooter wrote “Anti-ICE” on his shell casings. This is despicable! The Brave Men and Women of ICE are just trying to do their jobs, and remove the “WORST of the WORST” Criminals out of our Country, but they are facing an unprecedented increase in threats, violence, and attacks by Deranged Radical Leftists. This violence is the result of the Radical Left Democrats constantly demonizing Law Enforcement, calling for ICE to be demolished, and comparing ICE Officers to “Nazis.” The continuing violence from Radical Left Terrorists, in the aftermath of Charlie Kirk’s assassination, must be stopped. ICE Officers, and other Brave Members of Law Enforcement, are under grave threat. We have already declared ANTIFA a Terrorist Organization, and I will be signing an Executive Order this week to dismantle these Domestic Terrorism Networks. I AM CALLING ON ALL DEMOCRATS TO STOP THIS RHETORIC AGAINST ICE AND AMERICA’S LAW ENFORCEMENT, RIGHT NOW! The Trump Administration is fully committed to backing Law Enforcement, Strong Borders, securing our Homeland, deporting Violent Illegal Criminals, and fully rooting out the Left Wing Domestic Terrorism that is terrorizing our Country. Thank you for your attention to this matter!" 
--------------------------------------------------------------------------------
ETF Data Analysis¶
The stock market data needs to be transformed before it can be analyzed effectively: we compute each ETF's daily volatility, standardize it, and compare volatility on dates when Trump's social media posts include keywords against baseline dates when his posts contain no keywords or when he did not post at all.
Market volatility is the quantity we are interested in, computed from each day's high and low using Parkinson's range-based estimator. As a rough scale, values around 0.005 indicate low volatility, 0.01 moderate, 0.02 high, and anything above 0.03 extreme, reflecting market uncertainty that we can then try to attribute to Trump's social media posts. We also standardized the volatility (as z-scores) to show how each day compares to the average market day.
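The Parkinson_Volatility and standardize_P helpers are defined earlier in the notebook; as an assumption for illustration, a standard single-day form of each would look roughly like this.
import numpy as np
def Parkinson_Volatility(high, low):
    """Single-day Parkinson range-based volatility estimate from the day's high and low."""
    return np.sqrt((1.0 / (4.0 * np.log(2))) * np.log(high / low) ** 2)
def standardize_P(df, col):
    """Append a z-scored copy of `col`, e.g. 'SPX_Vol' -> 'SPX_Vol_z'."""
    df[col + '_z'] = (df[col] - df[col].mean()) / df[col].std()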
df4['SPX_Vol'] = [Parkinson_Volatility(df4['SPX_High'][x], df4['SPX_Low'][x]) for x in df4.index]
df4['COMP_Vol'] = [Parkinson_Volatility(df4['COMP_High'][x], df4['COMP_Low'][x]) for x in df4.index]
df4['DJIA_Vol'] = [Parkinson_Volatility(df4['DJIA_High'][x], df4['DJIA_Low'][x]) for x in df4.index]
standardize_P(df4, 'SPX_Vol')
standardize_P(df4, 'COMP_Vol')
standardize_P(df4, 'DJIA_Vol')
fig, axs = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
axs[0].plot(df4['Date'], df4['SPX_Vol'], label='SPX Vol')
axs[0].set_ylabel('SPX Volatility')
axs[0].legend(loc='upper left')
axs[1].plot(df4['Date'], df4['COMP_Vol'], label='COMP Vol')
axs[1].set_ylabel('COMP Volatility')
axs[1].legend(loc='upper left')
axs[2].plot(df4['Date'], df4['DJIA_Vol'], label='DJIA Vol')
axs[2].set_xlabel('Date')
axs[2].set_ylabel('DJIA Volatility')
axs[2].legend(loc='upper left')
plt.show()
To fully analyze whatever effect Trump's social media posts have on the market, we need to establish a baseline for comparison. Here we separate dates into three groups: days when Trump posted with our keywords of interest, days when he posted without keywords, and days with no posts at all.
posts_with_keyword = posts[posts['contains_econ_keyword'] > 0].rename(columns={'date':'Date'})
dates_with_keyword = posts_with_keyword['Date'].unique()
num_post_key = posts_with_keyword['Date'].value_counts()
posts_without_keyword = posts[posts['contains_econ_keyword'] == 0].rename(columns={'date':'Date'})
dates_without_keyword = posts_without_keyword['Date'].unique()
num_post_without = posts_without_keyword['Date'].value_counts()
#creates date range from all posts in dataset
start_date = date.fromisoformat('2016-01-04')
end_date = date.fromisoformat('2025-11-09')
date_range = [(end_date - timedelta(days=i)).strftime('%Y-%m-%d') for i in range((end_date - start_date).days)]
dates_no_post = [date for date in date_range if date not in dates_with_keyword and date not in dates_without_keyword]
no_posts = pd.DataFrame()
df4_strdate = df4.copy()
df4_strdate['Date'] = [d.strftime('%Y-%m-%d') for d in df4['Date']]
df4_dind = df4_strdate.set_index('Date')
no_posts['Date'] = [date for date in dates_no_post if date in df4_strdate['Date'].to_list()]
no_posts['SPX_Vol_z'] = [df4_dind['SPX_Vol_z'][date] for date in no_posts['Date']]
no_posts['COMP_Vol_z'] = [df4_dind['COMP_Vol_z'][date] for date in no_posts['Date']]
no_posts['DJIA_Vol_z'] = [df4_dind['DJIA_Vol_z'][date] for date in no_posts['Date']]
no_posts
| | Date | SPX_Vol_z | COMP_Vol_z | DJIA_Vol_z |
|---|---|---|---|---|
| 0 | 2024-11-06 | 0.113602 | 0.038729 | 1.238807 |
| 1 | 2022-07-28 | 1.186099 | 1.062864 | 0.997576 |
| 2 | 2022-06-03 | -0.048789 | 0.276387 | -0.246623 |
| 3 | 2022-04-27 | 0.862391 | 0.797176 | 0.782769 |
| 4 | 2022-04-26 | 1.545202 | 2.021179 | 1.093185 |
| ... | ... | ... | ... | ... |
| 331 | 2017-10-06 | -1.033221 | -1.067977 | -1.094201 |
| 332 | 2017-06-08 | -0.748646 | -0.762334 | -0.601041 |
| 333 | 2016-11-25 | -0.917355 | -1.058072 | -0.954241 |
| 334 | 2016-11-14 | -0.469010 | -0.348045 | -0.570904 |
| 335 | 2016-10-07 | -0.156875 | -0.484575 | -0.203838 |
336 rows × 4 columns
The volatility on any single day doesn't tell us much by itself, so we show it over an event window from one day before to three days after each post to visualize how volatility changes.
event_window_key = event_windows(df4, posts_with_keyword.drop_duplicates(subset=['Date']), 'SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z')
average_key = average_event_windows(event_window_key, ['SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z'])
event_window_nok = event_windows(df4, posts_without_keyword.drop_duplicates(subset=['Date']), 'SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z')
average_nok = average_event_windows(event_window_nok, ['SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z'])
event_window_nop = event_windows(df4, no_posts, 'SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z')
average_nop = average_event_windows(event_window_nop, ['SPX_Vol_z', 'COMP_Vol_z', 'DJIA_Vol_z'])
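The event_windows and average_event_windows helpers are defined earlier in the notebook; as a hedged sketch (window bounds and column handling are assumptions), the windowing and averaging could be structured roughly like this.
import pandas as pd
def event_windows_sketch(market: pd.DataFrame, events: pd.DataFrame, cols, pre=1, post=3):
    """For each event date, collect the z-scored volatility columns from t=-pre to t=+post trading days."""
    market = market.reset_index(drop=True)
    row_of = {d: i for i, d in enumerate(market['Date'])}
    windows = []
    for d in events['Date']:
        i = row_of.get(d)
        if i is None or i - pre < 0 or i + post >= len(market):
            continue  # skip events without a full window of trading days
        w = market.loc[i - pre:i + post, cols].reset_index(drop=True)
        w['t'] = range(-pre, post + 1)
        windows.append(w)
    return windows
def average_event_windows_sketch(windows, cols):
    """Average each volatility column at each relative day t across all event windows."""
    stacked = pd.concat(windows, ignore_index=True)
    return stacked.groupby('t')[cols].mean().reset_index()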
average_key
| | t | SPX_Vol_z | COMP_Vol_z | DJIA_Vol_z |
|---|---|---|---|---|
| 0 | -1 | 0.015386 | 0.002955 | 0.028936 |
| 1 | 0 | 0.025823 | 0.006683 | 0.042522 |
| 2 | 1 | 0.022372 | 0.007624 | 0.031434 |
| 3 | 2 | 0.028448 | 0.014381 | 0.044129 |
| 4 | 3 | 0.030760 | 0.011975 | 0.049873 |
plt.figure(figsize=(12, 6))
days = np.arange(-1,4)
plt.plot(days, average_key['SPX_Vol_z'], label="Keyword Posts SPX", color='b')
plt.plot(days, average_key['COMP_Vol_z'], label="Keyword Posts COMP", color='b', linestyle='--')
plt.plot(days, average_key['DJIA_Vol_z'], label="Keyword Posts DJIA", color='b', linestyle=':')
plt.plot(days, average_nok['SPX_Vol_z'], label="Non-Keyword Posts SPX", color='r')
plt.plot(days, average_nok['COMP_Vol_z'], label="Non-Keyword Posts COMP", color='r', linestyle='--')
plt.plot(days, average_nok['DJIA_Vol_z'], label="Non-Keyword Posts DJIA", color='r', linestyle=':')
plt.plot(days, average_nop['SPX_Vol_z'], label="No Posts SPX", color='orange')
plt.plot(days, average_nop['COMP_Vol_z'], label="No Posts COMP", color='orange', linestyle='--')
plt.plot(days, average_nop['DJIA_Vol_z'], label="No Posts DJIA", color='orange', linestyle=':')
plt.axvline(0, color="black", linestyle="--", linewidth=1) # event day marker
plt.title("Average Abnormal Volatility Around Posts")
plt.xlabel("Event Window (days)")
plt.ylabel("Abnormal Volatility")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
Generally, the average abnormal volatility seems to spike on event days (the day of a post) when the post contains a keyword, with almost no change or even dips in volatility otherwise. This suggests that keyword posts have some effect on market volatility on event days, with volatility even remaining elevated a couple of days afterward relative to the day before the event. By contrast, when there are no posts or the posts contain no keywords, volatility is harder to predict: no-post days show random dips and spikes, while non-keyword post days show very little change over the window.
Despite the appearance of an effect of Trump's social media posts on the volatility of these three stock indices, the average change in volatility is not large enough to conclude there is a causal relationship between the two. This can be seen when comparing post days to non-post days: the average volatility change on non-post days is often greater than on days when Trump posts, indicating that the volatility changes on post days fall within the range of normal market movement. As such, it is inconclusive whether Trump's posts affect the market to a significant degree, given the constraints of our data and the lack of context surrounding such a long time period.
Interpretation of P-Values and Analysis Expansion¶
P-Value Interpretation¶
The statistical test currently performed is a two-sample t-test comparing the slope of the average price change before the event ($\text{slope}_{\text{pre}}$) versus after the event ($\text{slope}_{\text{post}}$).
The Null Hypothesis ($\text{H}_0$) for this test is that there is no difference between the two slopes: $\text{slope}_{\text{pre}} = \text{slope}_{\text{post}}$.
- Volatility and Noise: In volatile markets like Bitcoin, the price trajectory is inherently noisy and rarely linear, even when averaged. This high underlying volatility violates the test's assumption of a stable, linear trend.
- False Significance: Since the market fundamentally deviates from the test's linear assumption, the statistical test frequently finds even small, non-meaningful differences in slopes to be highly significant ($\text{p-value} \ll 0.05$). This is why the control group p-values ($\text{p}_{\text{control}}$) are also small, indicating a statistical artifact of market noise rather than true explanatory power.
Improved Null Hypothesis¶
To establish a more meaningful statistical baseline, this research could be expanded by the following:
- Expanded Null Hypothesis: Instead of comparing $\text{p}_{\text{keyword}}$ to a static $\alpha=0.05$ threshold, we would compare it to the distribution of p-values generated from hundreds or thousands of randomly generated control event sets.
- True Baseline: This expansion would establish a true empirical baseline for "random market momentum shifts" ($\text{p}_{\text{random}}$), allowing us to determine if the keyword events are statistically significant relative to typical market noise.
However, generating, processing, and calculating the trend shift p-value for thousands of control datasets would be too computationally expensive and time-consuming to complete in a reasonable timeframe within this analysis environment, making the single control group used here a necessary simplification.
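A minimal sketch of the expanded baseline described above, assuming a function trend_shift_pvalue(event_dates) that returns the pre/post slope-comparison p-value for a given set of event dates; that function is hypothetical here and stands in for the analysis pipeline used above.
import numpy as np
def empirical_baseline(trend_shift_pvalue, keyword_dates, all_dates, n_controls=1000, seed=0):
    """Compare the keyword-event p-value against p-values from random control event sets."""
    rng = np.random.default_rng(seed)
    p_keyword = trend_shift_pvalue(keyword_dates)
    control_ps = np.array([
        trend_shift_pvalue(rng.choice(all_dates, size=len(keyword_dates), replace=False))
        for _ in range(n_controls)
    ])
    # fraction of random event sets producing a p-value at least as small as the keyword one
    empirical_p = (control_ps <= p_keyword).mean()
    return p_keyword, empirical_p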
Ethics¶
A. Data Collection¶
- A.1 Informed consent: If there are human subjects, have they given informed consent, where subjects affirmatively opt-in and have a clear understanding of the data uses to which they consent?
- A.3 Limit PII exposure: Have we considered ways to minimize exposure of personally identifiable information (PII) for example through anonymization or not collecting information that isn't relevant for analysis?
- A.4 Downstream bias mitigation: Have we considered ways to enable testing downstream results for biased outcomes (e.g., collecting data on protected group status like race or gender)?
B. Data Storage¶
- B.1 Data security: Do we have a plan to protect and secure data (e.g., encryption at rest and in transit, access controls on internal users and third parties, access logs, and up-to-date software)?
Yes, most of our data will be publicly available and accessible, so there is no need to worry about that.
- B.2 Right to be forgotten: Do we have a mechanism through which an individual can request their personal information be removed?
Yes, any individual can request that their personal information be removed by contacting our support email: kemata@ucsd.edu
- B.3 Data retention plan: Is there a schedule or plan to delete the data after it is no longer needed?
Not Applicable as it is all publicly available data, and we are not collecting our own.
C. Analysis¶
- C.1 Missing perspectives: Have we sought to address blindspots in the analysis through engagement with relevant stakeholders (e.g., checking assumptions and discussing implications with affected communities and subject matter experts)?
- C.2 Dataset bias: Have we examined the data for possible sources of bias and taken steps to mitigate or address these biases (e.g., stereotype perpetuation, confirmation bias, imbalanced classes, or omitted confounding variables)?
We use quantitative market data, but avoiding bias in the analysis of tweets is difficult since sentiment analysis of text is inherently biased. We could try to find an algorithm that relies on dictionary definitions, but the cultural significance of certain phrases could be missed in doing so.
- C.3 Honest representation: Are our visualizations, summary statistics, and reports designed to honestly represent the underlying data?
- C.4 Privacy in analysis: Have we ensured that data with PII are not used or displayed unless necessary for the analysis?
The only real PII in our data is essentially Trump's name and perhaps other information included in his tweets, which are publicly available. No PII aside from his name will be shared, though, as sentiment analysis is performed on the tweets, so no other identifiable information should be included in the project.
D. Modeling¶
- D.1 Proxy discrimination: Have we ensured that the model does not rely on variables or proxies for variables that are unfairly discriminatory?
The market data is entirely quantitative.
- D.2 Fairness across groups: Have we tested model results for fairness with respect to different affected groups (e.g., tested for disparate error rates)?
- D.3 Metric selection: Have we considered the effects of optimizing for our defined metrics and considered additional metrics?
- D.4 Explainability: Can we explain in understandable terms a decision the model made in cases where a justification is needed?
Clear visualizations and analysis should be easily understood by a general audience.
- D.5 Communicate limitations: Have we communicated the shortcomings, limitations, and biases of the model to relevant stakeholders in ways that can be generally understood?
This will be discussed in the ethics portion of our final project.
E. Deployment¶
- E.1 Monitoring and evaluation: Do we have a clear plan to monitor the model and its impacts after it is deployed (e.g., performance monitoring, regular audit of sample predictions, human review of high-stakes decisions, reviewing downstream impacts of errors or low-confidence decisions, testing for concept drift)?
- E.2 Redress: Have we discussed with our organization a plan for response if users are harmed by the results (e.g., how does the data science team evaluate these cases and update analysis and models to prevent future harm)?
- E.3 Roll back: Is there a way to turn off or roll back the model in production if necessary?
- E.4 Unintended use: Have we taken steps to identify and prevent unintended uses and abuse of the model and do we have a plan to monitor these once the model is deployed?
We will include a warning that many other factors also affect the market, and that the correlation we are exploring does not explain why the market moves and should not be used to inform stock-purchase decisions.
Discussion and Conclusion¶
Our key results show that the relationship between Trump’s social media posts and financial market behavior is inconclusive. Across our event-window comparisons (aligning market data around post timestamps) and our sentiment and keyword filtering pipeline, we do not see a stable, repeatable pattern where volatility reliably increases (or decreases) after posts. There are occasional apparent spikes in certain windows or subsets, but those effects are inconsistent across time periods and do not persist when compared against reasonable baselines (such as other days or other post categories), which suggests the signal, if present at all, is weak relative to normal market noise. In short, there is no definitive correlation or pattern between the social media posts and the financial markets.
Referring back to our Background and Prior Work, our project extends the earlier “Twitter/Trump effect” framing by focusing specifically on economy-related content. In theory, if political communication were strongly shaping market uncertainty, we would expect clearer post-event structure (for example, consistent volatility changes sometime after a post) and a clearer separation between economy-related posts and other content. In contrast, our results indicate that any influence is difficult to isolate in the presence of economic news, scheduled announcements, broader media cycles, and platform/timing effects (for example, posting time versus market trading hours).
Putting everything together, we found that it is hard to clearly detect a "tweet causes market change" effect with our current approach. Markets are noisy and volatility moves for many reasons, so keyword and sentiment methods cannot isolate the impact of Trump's posts very well. Overall, we cannot confidently claim a strong correlation or causal link between economy-related posts and the financial data we analyzed. A stronger test would need better controls (such as placebo events and major-news filtering) and more detailed data with formal statistical testing. Unfortunately, we did not have the computational capacity to test very large quantities of data and dataframes, but our results would most likely remain inconclusive.
Our results were somewhat surprising, since they contradict our initial hypothesis that social media posts containing specific keywords would correlate with spikes in market volatility, particularly in crypto. Our data shows that around the time a post is published, the market behaves typically, contradicting our hypothesis. We expected fear and greed to drive the market after these posts, but the sentiment of most flagged posts was neutral, meaning that, for our keywords, the posts did not appear intended to influence the market in any way.
Our findings align with prior research examining Trump's social media influence on financial markets. Wolff's 2019 analysis of Trump's tweets during his first presidential term similarly concluded that while some anomalies appeared in the data, the overall market response was not statistically significant across all sectors.1
Our project had many limitations that constrain the generalizability of our findings. We had a gap in our tweet data due to Trump's Twitter account being suspended between January 2021 and November 2022. This gap disrupted the time-series continuity, making it hard to perform a comparison between the first and second terms and to see how his influence has changed over time.
Another limitation is that we relied on hourly Bitcoin data. Since the crypto market is open 24/7, there can be variables that influence the market within minutes or even seconds. Because we used hourly data, we may have smoothed out important volatility spikes.
If this investigation were to continue, the next step would be to obtain higher-frequency price data, such as minute-by-minute instead of hourly. Since crypto markets react to news nearly instantly, higher-frequency data could reveal volatility spikes we are currently missing.
As mentioned in the BTC hourly data analysis, statistical significance could be determined by comparing our observed data against empirically sampled baseline data from time ranges when no economy-related posts occurred. However, implementing such statistical testing would require processing substantially larger datasets than our current computational resources can support.