Probabilistic Graphical Models with pgmpy
# Install pgmpy into the notebook's environment (the '!' prefix is an
# IPython shell escape, so this runs pip as a shell command).
!pip install pgmpy
Collecting pgmpy Downloading pgmpy-0.1.2.tar.gz (147kB) [K 100% |████████████████████████████████| 153kB 5.9MB/s [?25hRequirement already satisfied (use --upgrade to upgrade): networkx>=1.8.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy) Requirement already satisfied (use --upgrade to upgrade): scipy>=0.12.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy) Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy) Requirement already satisfied (use --upgrade to upgrade): nose>=1.3.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy) Collecting coveralls>=0.4 (from pgmpy) Downloading coveralls-1.1-py2.py3-none-any.whl Requirement already satisfied (use --upgrade to upgrade): decorator>=3.4.0 in /opt/conda/lib/python3.5/site-packages (from networkx>=1.8.1->pgmpy) Collecting docopt>=0.6.1 (from coveralls>=0.4->pgmpy) Downloading docopt-0.6.2.tar.gz Collecting coverage>=3.6 (from coveralls>=0.4->pgmpy) Downloading coverage-4.3.4-cp35-cp35m-manylinux1_x86_64.whl (191kB) [K 100% |████████████████████████████████| 194kB 4.7MB/s [?25hRequirement already satisfied (use --upgrade to upgrade): requests>=1.0.0 in /opt/conda/lib/python3.5/site-packages (from coveralls>=0.4->pgmpy) Building wheels for collected packages: pgmpy, docopt Running setup.py bdist_wheel for pgmpy ... [?25l- | done [?25h Stored in directory: /home/jovyan/.cache/pip/wheels/d3/21/0f/5b1fc282ee2ab16b693c1a0ed9cb8fde44dbaa28d907c90ff4 Running setup.py bdist_wheel for docopt ... [?25l- done [?25h Stored in directory: /home/jovyan/.cache/pip/wheels/b2/16/5f/c33a2bb5f2dce71205f8e65cbfd05647d79d441282be31fd82 Successfully built pgmpy docopt Installing collected packages: docopt, coverage, coveralls, pgmpy Successfully installed coverage-4.3.4 coveralls-1.1 docopt-0.6.2 pgmpy-0.1.2 [33mYou are using pip version 8.1.2, however version 9.0.1 is available. 
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
from pgmpy.factors import TabularCPD

# Conditional probability table P(G | I, D): grade G has 3 states,
# conditioned on intelligence I and difficulty D (2 states each).
# Each row is one grade level; each column is one (I, D) assignment.
grade_values = [
    [0.3, 0.05, 0.9, 0.5],
    [0.4, 0.25, 0.08, 0.3],
    [0.3, 0.7, 0.02, 0.2],
]
grade_cpd = TabularCPD(
    variable="G",
    variable_card=3,
    values=grade_values,
    evidence=["I", "D"],
    evidence_card=[2, 2],
)
# Bare expression so the notebook renders the CPD table.
grade_cpd
D | D_0 | D_0 | D_1 | D_1 |
I | I_0 | I_1 | I_0 | I_1 |
G_0 | 0.3000 | 0.0500 | 0.9000 | 0.5000 |
G_1 | 0.4000 | 0.2500 | 0.0800 | 0.3000 |
G_2 | 0.3000 | 0.7000 | 0.0200 | 0.2000 |
# Declare the student model in pgmpy
from pgmpy.models import BayesianModel
from pgmpy.factors import TabularCPD

# Edges point parent -> child: G depends on D and I,
# L depends on G, and S depends on I.
student_edges = [("D", "G"), ("I", "G"), ("G", "L"), ("I", "S")]
student_model = BayesianModel(student_edges)
# Define one CPD per node, then attach them all to the model.

# P(G | I, D): three grade levels conditioned on intelligence and difficulty.
grade_cpd = TabularCPD(
    variable="G", variable_card=3,
    values=[[0.3, 0.05, 0.9, 0.5],
            [0.4, 0.25, 0.08, 0.3],
            [0.3, 0.7, 0.02, 0.2]],
    evidence=["I", "D"], evidence_card=[2, 2])

# P(D): prior over course difficulty.
difficulty_cpd = TabularCPD(
    variable="D", variable_card=2,
    values=[[0.6, 0.4]])

# P(I): prior over student intelligence.
intel_cpd = TabularCPD(
    variable="I", variable_card=2,
    values=[[0.7, 0.3]])

# P(L | G): recommendation-letter quality given the grade.
letter_cpd = TabularCPD(
    variable="L", variable_card=2,
    values=[[0.1, 0.4, 0.99],
            [0.9, 0.6, 0.01]],
    evidence=["G"], evidence_card=[3])

# P(S | I): SAT result given intelligence.
sat_cpd = TabularCPD(
    variable="S", variable_card=2,
    values=[[0.95, 0.2],
            [0.05, 0.8]],
    evidence=["I"], evidence_card=[2])

# Register every CPD with its node in the network.
student_model.add_cpds(grade_cpd, difficulty_cpd, intel_cpd,
                       letter_cpd, sat_cpd)
# Display the grade CPD (the notebook echoes the last bare expression).
grade_cpd
D | D_0 | D_0 | D_1 | D_1 |
I | I_0 | I_1 | I_0 | I_1 |
G_0 | 0.3000 | 0.0500 | 0.9000 | 0.5000 |
G_1 | 0.4000 | 0.2500 | 0.0800 | 0.3000 |
G_2 | 0.3000 | 0.7000 | 0.0200 | 0.2000 |
# Retrieve the CPD the model has attached to node G — should match grade_cpd.
student_model.get_cpds('G')
D | D_0 | D_0 | D_1 | D_1 |
I | I_0 | I_1 | I_0 | I_1 |
G_0 | 0.3000 | 0.0500 | 0.9000 | 0.5000 |
G_1 | 0.4000 | 0.2500 | 0.0800 | 0.3000 |
G_2 | 0.3000 | 0.7000 | 0.0200 | 0.2000 |
# Parents of G in the DAG (the conditioning variables of its CPD).
student_model.get_parents('G')
['D', 'I']
from pgmpy.inference import VariableElimination

# Exact inference over the student network via variable elimination.
student_infer = VariableElimination(student_model)
# `variables` expects a LIST of variable names. The original passed the
# bare string 'G', which only worked by accident because iterating "G"
# yields ["G"]; a multi-character name would have broken it.
prob_G = student_infer.query(variables=['G'])
# query() returns a dict mapping each queried variable to its factor.
print(prob_G['G'])
╒═════╤══════════╕
│ G │ phi(G) │
╞═════╪══════════╡
│ G_0 │ 0.4470 │
├─────┼──────────┤
│ G_1 │ 0.2714 │
├─────┼──────────┤
│ G_2 │ 0.2816 │
╘═════╧══════════╛
# Posterior over G for a smart student (I=1) in an easy class (D=0).
# `variables` takes a list; the original's bare string 'G' worked only
# because iterating it yields ["G"].
prob_G = student_infer.query(variables=['G'], evidence={'I': 1, 'D': 0})
print(prob_G['G'])
╒═════╤══════════╕
│ G │ phi(G) │
╞═════╪══════════╡
│ G_0 │ 0.0500 │
├─────┼──────────┤
│ G_1 │ 0.2500 │
├─────┼──────────┤
│ G_2 │ 0.7000 │
╘═════╧══════════╛
# Posterior over G for a less able student (I=0) in a hard class (D=1).
# `variables` takes a list; the original's bare string 'G' worked only
# because iterating it yields ["G"].
prob_G = student_infer.query(variables=['G'], evidence={'I': 0, 'D': 1})
print(prob_G['G'])
╒═════╤══════════╕
│ G │ phi(G) │
╞═════╪══════════╡
│ G_0 │ 0.9000 │
├─────┼──────────┤
│ G_1 │ 0.0800 │
├─────┼──────────┤
│ G_2 │ 0.0200 │
╘═════╧══════════╛
#Train Model from Data
from pgmpy.models import BayesianModel
import pandas as pd
import numpy as np

# Each of the five variables has exactly two states, so uniform random
# 0/1 draws are enough to exercise parameter learning.
raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
data = pd.DataFrame(raw_data, columns=["D", "I", "G", "L", "S"])

# Use the first 75% of rows for training; show what the model is fit on.
split = int(data.shape[0] * 0.75)
data_train = data[:split]
print(data_train)

# Same DAG as before, rebuilt from scratch for the learning run.
student_model = BayesianModel([("D", "G"), ("I", "G"),
                               ("I", "S"), ("G", "L")])
student_model.fit(data_train)  # maximum-likelihood estimation of all CPDs
student_model.get_cpds('D')
D I G L S
0 0 1 1 1 1
1 0 0 0 0 1
2 0 1 1 0 0
3 0 0 0 0 1
4 1 1 0 1 1
5 1 0 0 0 0
6 1 1 0 1 1
7 1 0 0 0 1
8 1 1 0 0 1
9 1 0 0 0 0
10 1 1 1 1 0
11 1 1 0 0 0
12 1 1 1 1 0
13 1 0 0 1 1
14 0 1 1 1 1
15 1 0 0 1 1
16 1 1 0 1 1
17 0 0 0 1 0
18 0 0 0 0 0
19 1 1 1 1 1
20 0 0 0 1 1
21 0 0 0 0 0
22 0 0 1 0 0
23 0 0 0 1 0
24 0 0 1 1 1
25 1 0 1 1 1
26 0 1 1 1 0
27 0 1 0 0 1
28 1 1 0 0 1
29 1 0 0 0 0
.. .. .. .. .. ..
720 1 1 1 0 1
721 0 0 1 1 1
722 1 1 1 0 1
723 1 0 0 0 0
724 1 0 0 1 1
725 0 0 1 1 1
726 0 0 1 0 1
727 0 0 1 0 0
728 1 1 1 0 0
729 1 1 0 0 0
730 0 0 0 0 0
731 1 1 1 0 0
732 0 0 1 0 1
733 1 1 0 1 0
734 0 1 0 1 1
735 0 1 1 1 0
736 1 0 1 0 0
737 1 1 1 1 0
738 1 1 1 1 1
739 0 1 1 0 0
740 1 1 1 1 1
741 1 0 1 1 0
742 0 1 1 0 1
743 1 0 1 1 0
744 0 1 1 1 1
745 1 0 0 0 1
746 1 0 0 1 0
747 0 1 1 0 1
748 1 0 0 1 1
749 0 0 1 1 1
[750 rows x 5 columns]
D_0 | 0.4400 |
D_1 | 0.5600 |
# CPD learned for L from the training data.
student_model.get_cpds('L')
G | G_0 | G_1 |
L_0 | 0.4545 | 0.5000 |
L_1 | 0.5455 | 0.5000 |
# Nodes reachable from D along active (unblocked) trails, given no evidence.
student_model.active_trail_nodes('D')
{'D', 'G', 'L'}
# Local Markov independencies of G: G is independent of its
# non-descendants given its parents.
student_model.local_independencies('G')
(G _|_ S | D, I)
# All conditional-independence assertions implied by the model structure.
student_model.get_independencies()
(G _|_ L, I, S | D)
(G _|_ L, I, D | S)
(G _|_ I, S, D | L)
(G _|_ L, D | I)
(D _|_ I, S | G)
(D _|_ G, L | S)
(D _|_ G, I, S | L)
(D _|_ G, L | I)
(S _|_ I, D | G)
(S _|_ G, I, L | D)
(S _|_ G, I, D | L)
(L _|_ G, I, S | D)
(L _|_ G, I, D | S)
(L _|_ G, D | I)
(I _|_ D, S | G)
(I _|_ G, S, L | D)
(I _|_ G, L | S)
(I _|_ G, D, S | L)
# Hold out the last 25% of rows, drop the target column G, and predict it.
# Take an explicit copy first: calling drop(..., inplace=True) on a
# slice of `data` operates on a view and triggers pandas'
# SettingWithCopyWarning (visible in the original run's output), with no
# guarantee the drop takes effect.
data_test = data[int(0.75 * data.shape[0]) : data.shape[0]].copy()
data_test.drop('G', axis=1, inplace=True)
student_model.predict(data_test)
/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
app.launch_new_instance()
G | |
---|---|
750 | 1 |
751 | 0 |
752 | 0 |
753 | 0 |
754 | 0 |
755 | 0 |
756 | 0 |
757 | 1 |
758 | 0 |
759 | 1 |
760 | 0 |
761 | 0 |
762 | 1 |
763 | 1 |
764 | 0 |
765 | 1 |
766 | 1 |
767 | 0 |
768 | 0 |
769 | 1 |
770 | 1 |
771 | 1 |
772 | 0 |
773 | 1 |
774 | 1 |
775 | 0 |
776 | 1 |
777 | 1 |
778 | 0 |
779 | 0 |
... | ... |
970 | 1 |
971 | 0 |
972 | 1 |
973 | 1 |
974 | 1 |
975 | 0 |
976 | 1 |
977 | 0 |
978 | 1 |
979 | 1 |
980 | 1 |
981 | 1 |
982 | 1 |
983 | 0 |
984 | 1 |
985 | 1 |
986 | 1 |
987 | 0 |
988 | 0 |
989 | 1 |
990 | 0 |
991 | 0 |
992 | 0 |
993 | 0 |
994 | 0 |
995 | 1 |
996 | 1 |
997 | 1 |
998 | 0 |
999 | 0 |
250 rows × 1 columns