3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
    "python-envs.pythonProjects": []
}
21 changes: 21 additions & 0 deletions stock-scraper/.gitignore
@@ -0,0 +1,21 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Ruff
.ruff_cache/

# Virtual environments
.venv

# Vagrant
.vagrant

# user data directory
_userdatadir/
_userdatachrome/
_userdatafirefox/
1 change: 1 addition & 0 deletions stock-scraper/.python-version
@@ -0,0 +1 @@
3.12
27 changes: 27 additions & 0 deletions stock-scraper/Makefile
@@ -0,0 +1,27 @@
# note: call scripts from /scripts
# use bash instead of the default /bin/sh
SHELL := /bin/bash

PROG_ROOT := webapp
PROG_NAME := main.py

.PHONY: all sync format format-check purge cleandb
all: sync format-check

# cleans up everything including database and venv
purge: cleandb
	-rm -rf .venv
	-rm -rf .tox
	-rm -rf __pycache__

# placeholder: database cleanup referenced by purge (the database path is not defined yet)
cleandb:
	@echo "cleandb: nothing to clean yet"

# format the code (line length kept in sync with ruff.toml)
format:
	# uv tool run ruff format .
	uv format -- --line-length 130

format-check:
	uv format --diff
	uv format --check

# sync the dependencies including venv
sync:
	uv sync
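The targets wrap the `uv` workflow; typical invocations, assuming `uv` is on the PATH:

```sh
make          # default: sync dependencies, then check formatting
make format   # rewrite files with ruff's formatter
make purge    # remove the venv, tox and pycache directories
```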
21 changes: 21 additions & 0 deletions stock-scraper/README.md
@@ -0,0 +1,21 @@
# Stock Scraper


### Setup

```sh
uv sync    # install dependencies and create the virtual environment
uv format  # format the python code

# copy chrome configs, see chrome://version in the browser
cp -r ~/.config/google-chrome/ _userdatachrome

# copy firefox configs, see about:profiles in the browser
cp -r ~/.mozilla/firefox/ _userdatafirefox

uv run main.py

```

### Notes

The first time the script runs, you have to authenticate manually in the browser window it opens; pass `--initial-login` so the script waits five minutes instead of five seconds before closing.
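
A typical first run might look like this — a sketch, with the flags taken from `main.py` and `$(pwd)` used because `--chrome-user-data-dir` expects an absolute path:

```sh
# first run: the browser stays open for five minutes so you can log in
uv run main.py --initial-login \
    --chrome-user-data-dir "$(pwd)/_userdatachrome" \
    --chrome-profile-directory Default

# later runs reuse the saved session
uv run main.py
```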
69 changes: 69 additions & 0 deletions stock-scraper/a.py
@@ -0,0 +1,69 @@
# Scrapes company names and tickers from the Musaffa screener
import csv
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


def main():
    # Set options (prevents the browser from closing after the script exits)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    # options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    # Open the browser
    print('Opening browser')
    driver = webdriver.Chrome(options=options)
    print('Opened browser')

    # Maximize the browser window
    driver.maximize_window()
    print('Maximized browser')

    # Open the screener
    driver.implicitly_wait(10)
    driver.get('https://screener.musaffa.com/cabinet/onboarding')

    page_number = 1
    for _ in range(759):  # iterate over the result pages
        # gather company names
        company_names = driver.find_element(By.CLASS_NAME, 'table--body').find_elements(By.CLASS_NAME, "mb-0.company--name")
        company_names = [name.text for name in company_names]

        # gather company tickers
        company_tickers = driver.find_element(By.CLASS_NAME, 'table--body').find_elements(By.CLASS_NAME, "mb-0.stock--name")
        company_tickers = [ticker.text for ticker in company_tickers]

        # save this page's rows
        save(company_names, company_tickers, page_number)

        # go to the next page, except after the last one
        if page_number != 759:
            go_to_next_page(driver)
        page_number += 1


def go_to_next_page(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3.5)
    driver.find_element(By.CLASS_NAME, 'bi.bi-chevron-right').click()


def save(company_names, company_tickers, page_number):
    # newline='' prevents csv from inserting blank rows on some platforms
    with open('musaffa_tickers.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["company_name", "company_ticker", "page_number"])
        if page_number == 1:
            writer.writeheader()
        for company_name, company_ticker in zip(company_names, company_tickers):
            writer.writerow({"company_name": company_name,
                             "company_ticker": company_ticker,
                             "page_number": page_number})
    print(f"Saved page {page_number}")


if __name__ == "__main__":
    main()
69 changes: 69 additions & 0 deletions stock-scraper/b.py
@@ -0,0 +1,69 @@
# Scrapes company names and tickers from the Musaffa screener
import csv
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


def main():
    # Set options (prevents the browser from closing after the script exits)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)
    # options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    # Open the browser
    print('Opening browser')
    driver = webdriver.Chrome(options=options)
    print('Opened browser')

    # Maximize the browser window
    driver.maximize_window()
    print('Maximized browser')

    # Open the screener
    driver.implicitly_wait(10)
    driver.get('https://screener.musaffa.com/cabinet/onboarding')

    page_number = 1
    for _ in range(759):  # iterate over the result pages
        # gather company names
        company_names = driver.find_element(By.CLASS_NAME, 'table--body').find_elements(By.CLASS_NAME, "mb-0.company--name")
        company_names = [name.text for name in company_names]

        # gather company tickers
        company_tickers = driver.find_element(By.CLASS_NAME, 'table--body').find_elements(By.CLASS_NAME, "mb-0.stock--name")
        company_tickers = [ticker.text for ticker in company_tickers]

        # save this page's rows
        save(company_names, company_tickers, page_number)

        # go to the next page, except after the last one
        if page_number != 759:
            go_to_next_page(driver)
        page_number += 1


def go_to_next_page(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3.5)
    driver.find_element(By.CLASS_NAME, 'bi.bi-chevron-right').click()


def save(company_names, company_tickers, page_number):
    # newline='' prevents csv from inserting blank rows on some platforms
    with open('musaffa_tickers.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["company_name", "company_ticker", "page_number"])
        if page_number == 1:
            writer.writeheader()
        for company_name, company_ticker in zip(company_names, company_tickers):
            writer.writerow({"company_name": company_name,
                             "company_ticker": company_ticker,
                             "page_number": page_number})
    print(f"Saved page {page_number}")


if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions stock-scraper/main.py
@@ -0,0 +1,92 @@
import selenium
import selenium.webdriver
import selenium.webdriver.chrome.options
import selenium.webdriver.common.window

# from selenium.webdriver.common.by import By
import time
import argparse


def parse_arguments():
    """Parses command-line arguments."""
    parser = argparse.ArgumentParser(description="Stock Scraper Application")
    # add an initial login flag
    parser.add_argument(
        "--initial-login",
        action="store_true",
        help="Perform initial login if specified.",
    )
    # add arguments for the chrome user data directory and profile
    parser.add_argument(
        "--chrome-user-data-dir",
        action="store",
        default="_userdatachrome",
        type=str,
        help="Absolute path to the user data directory for chrome.",
    )
    parser.add_argument(
        "--chrome-profile-directory",
        action="store",
        default="Default",
        type=str,
        help="Name of the profile directory for chrome.",
    )

    return parser.parse_args()


def main():
    args = parse_arguments()

    chrome(args)
    # firefox(args)


def firefox(args):
    # Access the value of --initial-login
    login_sleep_interval = 5
    if args.initial_login:
        login_sleep_interval = 5 * 60

    # Set up Firefox options (stub: the Firefox path is not implemented yet)
    options = selenium.webdriver.FirefoxOptions()


def chrome(args):
    # Access the value of --initial-login
    login_sleep_interval = 5
    if args.initial_login:
        login_sleep_interval = 5 * 60

    # Set up Chrome options
    options = selenium.webdriver.chrome.options.Options()
    # reuse an existing user data directory so logins persist between runs
    options.add_argument("--no-sandbox")
    options.add_argument(f"--user-data-dir={args.chrome_user_data_dir}")
    options.add_argument(f"--profile-directory={args.chrome_profile_directory}")
    options.add_argument("--remote-debugging-port=9222")

    # Create a new instance of the Chrome driver
    driver = selenium.webdriver.Chrome(options=options)

    # Store the ID of the original window
    original_window = driver.current_window_handle
    print(driver.get_cookies())
    driver.get("https://musaffa.com")
    print(driver.get_cookies())

    # Open the site in a separate window instead (currently disabled)
    # new_window = driver.switch_to.new_window(selenium.webdriver.common.window.WindowType.WINDOW)
    # print(new_window)
    # new_window.maximize_window()
    # print("maxed")
    # new_window.get("https://musaffa.com")

    time.sleep(login_sleep_interval)  # pause so a manual login can be completed
    # new_window.close()

    # Close the browser
    # driver.switch_to.window(original_window)
    driver.quit()


if __name__ == "__main__":
    main()
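
`firefox()` above stops after creating the options object. A minimal sketch of how it might continue, mirroring `chrome()`; the profile path `_userdatafirefox` comes from the `.gitignore`, and the function name and flow are assumptions, not part of the current script:

```python
import time

import selenium.webdriver


def firefox_sketch(initial_login: bool):
    # assumed continuation of the firefox() stub
    login_sleep_interval = 5 * 60 if initial_login else 5

    options = selenium.webdriver.FirefoxOptions()
    # Selenium wraps a path string in a FirefoxProfile
    options.profile = "_userdatafirefox"

    driver = selenium.webdriver.Firefox(options=options)
    driver.get("https://musaffa.com")
    time.sleep(login_sleep_interval)  # pause so a manual login can be completed
    driver.quit()
```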
9 changes: 9 additions & 0 deletions stock-scraper/pyproject.toml
@@ -0,0 +1,9 @@
[project]
name = "stock-scraper"
version = "0.1.0"
description = "Scrapes stock tickers from stock screener websites"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "selenium>=4.35.0",
]
17 changes: 17 additions & 0 deletions stock-scraper/ruff.toml
@@ -0,0 +1,17 @@
#target-version = "py310"
#src = ["src"]

# Use a different line length.
line-length = 130

[lint.pydocstyle]
# Use Google-style docstrings.
convention = "google"

[lint.isort]
# https://docs.astral.sh/ruff/settings/#lint_isort_split-on-trailing-comma
# split-on-trailing-comma = false

[format]
# https://docs.astral.sh/ruff/settings/#format_skip-magic-trailing-comma
# skip-magic-trailing-comma = true
Empty file.
Empty file.
46 changes: 46 additions & 0 deletions stock-scraper/stockscraper/sourceinfo.py
@@ -0,0 +1,46 @@
import os

import yaml


class SourceInfo:
    def __init__(
        self,
        url: str,
        name: str,
        destination_directory: str,
        comment: str,
        parsing_function,
        expected_min_count: int,
        check_intvl: int,
    ):
        self.url = url
        self.comment = comment
        self.parsing_function = parsing_function
        self.destination_directory = destination_directory
        self.name = name
        self.expected_min_count = expected_min_count
        self.check_intvl = check_intvl  # in seconds
        self._file_path = os.path.join(destination_directory, f"{name}.yaml")
        self._tmp_file_path = os.path.join(destination_directory, "temp", f"{name}.yaml")
        self._last_check = self._get_last_check()
        self._stocks = []

    def _get_last_check(self):
        if os.path.exists(self._file_path):
            with open(self._file_path, "r") as f:
                data = yaml.safe_load(f)
                if data and "timestamp" in data:
                    return data["timestamp"]
        return None  # file doesn't exist or has no timestamp

    def create_temp_directory(self):
        # exist_ok makes this safe to call repeatedly; creates the destination dir too
        os.makedirs(os.path.dirname(self._tmp_file_path), exist_ok=True)


class StocksInfo:
    def __init__(self, symbol: str, name: str, exchange: str, country: str):
        self.symbol = symbol
        self.name = name
        self.exchange = exchange
        self.country = country
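
A minimal usage sketch; the `data/` directory, the values, and the placeholder parser are illustrative assumptions:

```python
from sourceinfo import SourceInfo

source = SourceInfo(
    url="https://example.com/stocks",  # hypothetical source
    name="example",
    destination_directory="data",
    comment="illustrative entry",
    parsing_function=None,
    expected_min_count=100,
    check_intvl=24 * 60 * 60,  # re-check once a day
)

source.create_temp_directory()
if source._last_check is None:  # private, but readable for a quick check
    print(f"{source.name} has never been checked")
```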
11 changes: 11 additions & 0 deletions stock-scraper/stockscraper/sources.py
@@ -0,0 +1,11 @@
from sourceinfo import SourceInfo

SOURCES = [
    # empty placeholder entry matching the SourceInfo constructor signature
    SourceInfo(
        url="",
        name="",
        destination_directory="",
        comment="",
        parsing_function=None,
        expected_min_count=0,
        check_intvl=0,
    ),
]
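
How the registry might be consumed — a sketch in which `scrape_all`, the epoch-seconds timestamps, and the interval gating are all assumptions about the intended design:

```python
import time

from sources import SOURCES


def scrape_all():
    now = time.time()
    for source in SOURCES:
        # skip sources checked within their interval (assumes epoch-second timestamps)
        last = source._last_check
        if last is not None and now - last < source.check_intvl:
            continue
        if source.parsing_function is not None:
            source.parsing_function(source)
```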