Examples¶
Python¶
Add missing values in one column.
<<<
import pandas
from pyensae.mlhelper import add_missing_indices
df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
df2 = add_missing_indices(df, "x", [3, 4, 5, 6])
print(df2)
>>>
x y z
0 3 4 1
4 3 6 2
1 4 4 1
5 4 6 2
2 5 4 1
6 5 6 2
3 6 4 1
7 6 6 2
<<<
import pandas
from pyensae.mlhelper import add_missing_indices
df = pandas.DataFrame([{"x": 3, "y": 4, "z": 1}, {"x": 5, "y": 6, "z": 2}])
df2 = add_missing_indices(df, "x", values=["y"], all_values=[3, 4, 5, 6])
print(df2)
>>>
x y z
0 3 4.0 1
4 3 NaN 2
1 4 NaN 1
5 4 NaN 2
2 5 NaN 1
6 5 6.0 2
3 6 NaN 1
7 6 NaN 2
(original entry : missing.py:docstring of pyensae.mlhelper.missing.add_missing_indices, line 10)
(original entry : antlr_grammar_build.py:docstring of pyensae.languages.antlr_grammar_build.build_grammar, line 14)
Compute the average returns and correlation matrix
import pyensae, pandas
from pyensae.finance import StockPrices
from pyensae.datasource import download_data
# download the CAC 40 composition from my website (for Yahoo)
download_data('cac40_2013_11_11.txt', website='xd')
# download all the prices (if not already done) and store them into files
actions = pandas.read_csv("cac40_2013_11_11.txt", sep="\t")
# we remove stocks with not enough historical data
stocks = { k:StockPrices(tick = k) for k,v in actions.values }
dates = StockPrices.available_dates(stocks.values())
stocks = {k:v for k,v in stocks.items() if len(v.missing(dates)) <= 10}
print("nb left", len(stocks))
# we remove dates with missing prices
dates = StockPrices.available_dates(stocks.values())
ok = dates[dates["missing"] == 0]
print("all dates before", len(dates), " after:" , len(ok))
for k in stocks:
stocks[k] = stocks[k].keep_dates(ok)
# we compute correlation matrix and returns
ret, cor = StockPrices.covariance(stocks.values(), cov = False, ret = True)
(original entry : astock.py:docstring of pyensae.finance.astock.StockPrices, line 52)
Convert R into Python
<<<
rscript = '''
nb=function(y=1930){
debut=1816
MatDFemale=matrix(D$Female,nrow=111)
colnames(MatDFemale)=(debut+0):198
cly=(y-debut+1):111
deces=diag(MatDFemale[:,cly[cly%in%1:199]])
return(c(B$Female[B$Year==y],deces))}
'''
from pyensae.languages.rconverter import r2python
print(r2python(rscript, pep8=True))
>>>
ANTLR runtime and generated code versions disagree: 4.13.0!=4.10.1
ANTLR runtime and generated code versions disagree: 4.13.0!=4.10.1
/usr/local/lib/python3.9/site-packages/autopep8.py:1726: PendingDeprecationWarning: lib2to3 package is deprecated and may not be able to parse Python 3.10+
from lib2to3 import pgen2
from python2r_helper import make_tuple
def nb(y=1930):
debut = 1816
MatDFemale = matrix(D . Female, nrow=111)
colnames(MatDFemale) .set(range((debut + 0), 198))
cly = range((y - debut + 1), 111)
deces = diag(MatDFemale[:, cly[set(cly) & set(range(1, 199))]])
return make_tuple(B . Female[B . Year == y], deces)
(original entry : rconverter.py:docstring of pyensae.languages.rconverter.r2python, line 14)
Cross join with a pandas dataframe
<<<
import pandas
from pyensae.mlhelper import df_crossjoin
df = pandas.DataFrame([{"x": 3, "y": 4}, {"x": 5, "y": 6}])
jj = df_crossjoin(df, df.copy())
>>>
A dataframe cannot be joined on itself; the second one must be a copy.
(original entry : joins.py:docstring of pyensae.mlhelper.joins.df_crossjoin, line 11)
Display an inline map with folium in a notebook
import folium
map_osm = folium.Map(location=[48.85, 2.34])
from pyensae.notebook_helper import folium_html_map
map_osm.polygon_marker(location=[48.824338, 2.302641], popup='ENSAE',
fill_color='#132b5e', num_sides=3, radius=10)
folium_html_map(map_osm)
With folium version 0.2, this becomes easier:
import folium
map_osm = folium.Map(location=[48.85, 2.34])
from pyensae.notebook_helper import folium_html_map
map_osm.polygon_marker(location=[48.824338, 2.302641], popup='ENSAE',
fill_color='#132b5e', num_sides=3, radius=10)
map_osm
(original entry : folium_helper.py:docstring of pyensae.notebookhelper.folium_helper.folium_html_map, line 14)
Download data for a practical lesson
from pyensae.datasource import download_data
download_data('voeux.zip', website='xd')
(original entry : http_retrieve.py:docstring of pyensae.datasource.http_retrieve.download_data, line 33)
Download data from a website
download_data("facebook.tar.gz", website="http://snap.stanford.edu/data/")
(original entry : http_retrieve.py:docstring of pyensae.datasource.http_retrieve.download_data, line 41)
Draw a grammar graph for a small code
from pyensae.languages import get_parser_lexer, parse_code, get_tree_graph
from pyensae.graph_helper import run_dot
code = '''
namespace hello
{
public static class world
{
public static double function(double x, double y)
{
return x+y ;
}
}
}
'''
clparser, cllexer = get_parser_lexer("C#")
parser = parse_code(code, clparser, cllexer)
tree = parser.compilation_unit()
st = get_tree_graph(tree, parser)
dot = st.to_dot()
with open(name, "w") as f:
f.write(dot)
img = os.path.join(temp, "graph.png")
run_dot(name, img)
(original entry : tree_graph_listener.py:docstring of pyensae.languages.tree_graph_listener.TreeGraphListener, line 4)
Loads French departments polygons
Simple example to retrieve French departments.
<<<
from pyensae.datasource import load_french_departements
df = load_french_departements()
print(df.head(2).T)
>>>
/usr/local/lib/python3.9/site-packages/shapefile.py:1302: ResourceWarning: unclosed file <_io.FileIO name='./GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01.7z' mode='rb' closefd=True>
record.points = list(izip(*(iter(flat),) * 2))
ResourceWarning: Enable tracemalloc to get the object allocation traceback
0 1
geometry POLYGON ((3.023296506544766 47.86102220986016,... POLYGON ((4.361587131774929 46.138458636221074...
CODE_DEPT 89 69
CODE_REG 27 84
CODE_CHF 024 381
ID_GEOFLA DEPARTEM0000000000000004 DEPARTEM0000000000000028
NOM_CHF AUXERRE LYON
NOM_DEPT YONNE RHONE
NOM_REG BOURGOGNE-FRANCHE-COMTE AUVERGNE-RHONE-ALPES
X_CENTROID 748211 832095
X_CHF_LIEU 742447 842221
Y_CENTROID 6750855 6530600
Y_CHF_LIEU 6744261 6520526
(original entry : geodata.py:docstring of pyensae.datasource.geodata.load_french_departements, line 6)
Retrieve stock prices from the Yahoo source
from pyensae.finance import StockPrices
prices = StockPrices(tick="NASDAQ:MSFT")
print(prices.dataframe.head())
(original entry : astock.py:docstring of pyensae.finance.astock.StockPrices, line 4)
Graph of a financial series
from pyensae.finance import StockPrices
stocks = [ StockPrices("NASDAQ:MSFT", folder = cache),
StockPrices("NASDAQ:GOOGL", folder = cache),
StockPrices("NASDAQ:AAPL", folder = cache)]
fig, ax, plt = StockPrices.draw(stocks)
fig.savefig("image.png")
fig, ax, plt = StockPrices.draw(stocks, begin="2010-01-01", figsize=(16,8))
plt.show()
You can also chain the graphs and add a series on a second graph:
from pyensae.finance import StockPrices
stock = StockPrices("NASDAQ:MSFT", folder = cache)
stock2 = StockPrices("NASDAQ:GOOGL", folder = cache)
fig, ax, plt = stock.plot(figsize=(16,8))
fig, ax, plt = stock2.plot(existing=(fig,ax), axis=2)
plt.show()
(original entry : astock.py:docstring of pyensae.finance.astock.StockPrices.draw, line 24)
Azure¶
Hadoop¶
SQL¶
Export the results of a SQL query into a flat file
from pyensae.sql.database_main import Database
dbfile = "filename.db3"
filetxt = "fileview.txt"
sql = "..."
db = Database(dbfile)
db.connect()
db.export_view_into_flat_file (sql, filetxt, header = True)
db.close()
(original entry : database_import_export.py:docstring of pyensae.sql.database_import_export.DatabaseImportExport.export_table_into_flat_file, line 13)
Import a flat file into a SQLite database
from pyensae import import_flatfile_into_database
dbf = "database.db3"
file = "textfile.txt"
import_flatfile_into_database(dbf, file)
On Windows, SQLiteSpy is a free and very useful tool for running SQL queries against a sqlite3 database.
(original entry : database_helper.py:docstring of pyensae.sql.database_helper.import_flatfile_into_database, line 16)
Import a DataFrame into a SQL table
values = [ {"name":"A", "age":10, "score":34.5 },
{"name":"B", "age":20, "score":-34.5 }, ]
df = pandas.DataFrame(values)
dbf = "something.db3"
db = Database.fill_sql_table(df, dbf, "mytable")
This example could be replaced by:
values = [ {"name":"A", "age":10, "score":34.5 },
{"name":"B", "age":20, "score":-34.5 }, ]
df = pandas.DataFrame(values)
dbf = "something.db3"
db = Database(dbf)
db.connect()
db.import_dataframe(df, "mytable")
db.close()
(original entry : database_main.py:docstring of pyensae.sql.database_main.Database.fill_sql_table, line 16)
Run a select command on a table
t = Database (file)
cur = t.execute ("SELECT * FROM table1 ;")
for f in cur :
print(f)
cur.close ()
(original entry : database_core.py:docstring of pyensae.sql.database_core.DatabaseCore.execute, line 7)