"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""importnumpyasnpimportpandasaspdclassRL(object):def__init__(self,action_space,learning_rate=0.01,reward_decay=0.9,e_greedy=0.9):self.actions=action_space# a listself.lr=learning_rateself.gamma=reward_decayself.epsilon=e_greedyself.q_table=pd.DataFrame(columns=self.actions,dtype=np.float64)defcheck_state_exist(self,state):ifstatenotinself.q_table.index:# append new state to q tableself.q_table=self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state,))defchoose_action(self,observation):self.check_state_exist(observation)# action selectionifnp.random.rand()<self.epsilon:# choose best actionstate_action=self.q_table.loc[observation,:]# some actions may have the same value, randomly choose on in these actionsaction=np.random.choice(state_action[state_action==np.max(state_action)].index)else:# choose random actionaction=np.random.choice(self.actions)returnactiondeflearn(self,*args):pass# off-policyclassQLearningTable(RL):def__init__(self,actions,learning_rate=0.01,reward_decay=0.9,e_greedy=0.9):super(QLearningTable,self).__init__(actions,learning_rate,reward_decay,e_greedy)deflearn(self,s,a,r,s_):self.check_state_exist(s_)q_predict=self.q_table.loc[s,a]ifs_!='terminal':q_target=r+self.gamma*self.q_table.loc[s_,:].max()# next state is not terminalelse:q_target=r# next state is terminalself.q_table.loc[s,a]+=self.lr*(q_target-q_predict)# update# on-policyclassSarsaTable(RL):def__init__(self,actions,learning_rate=0.01,reward_decay=0.9,e_greedy=0.9):super(SarsaTable,self).__init__(actions,learning_rate,reward_decay,e_greedy)deflearn(self,s,a,r,s_,a_):self.check_state_exist(s_)q_predict=self.q_table.loc[s,a]ifs_!='terminal':q_target=r+self.gamma*self.q_table.loc[s_,a_]# next state is not terminalelse:q_target=r# next state is terminalself.q_table.loc[s,a]+=self.lr*(q_target-q_predict)# update
"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""importnumpyasnpimportpandasaspdclassRL(object):def__init__(self,action_space,learning_rate=0.01,reward_decay=0.9,e_greedy=0.9):self.actions=action_space# a listself.lr=learning_rateself.gamma=reward_decayself.epsilon=e_greedyself.q_table=pd.DataFrame(columns=self.actions,dtype=np.float64)defcheck_state_exist(self,state):ifstatenotinself.q_table.index:# append new state to q tableself.q_table=self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state,))defchoose_action(self,observation):self.check_state_exist(observation)# action selectionifnp.random.rand()<self.epsilon:# choose best actionstate_action=self.q_table.loc[observation,:]# some actions may have the same value, randomly choose on in these actionsaction=np.random.choice(state_action[state_action==np.max(state_action)].index)else:# choose random actionaction=np.random.choice(self.actions)returnactiondeflearn(self,*args):pass# backward eligibility tracesclassSarsaLambdaTable(RL):def__init__(self,actions,learning_rate=0.01,reward_decay=0.9,e_greedy=0.9,trace_decay=0.9):super(SarsaLambdaTable,self).__init__(actions,learning_rate,reward_decay,e_greedy)# backward view, eligibility trace.self.lambda_=trace_decayself.eligibility_trace=self.q_table.copy()defcheck_state_exist(self,state):ifstatenotinself.q_table.index:# append new state to q tableto_be_append=pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state,)self.q_table=self.q_table.append(to_be_append)# also update eligibility traceself.eligibility_trace=self.eligibility_trace.append(to_be_append)deflearn(self,s,a,r,s_,a_):self.check_state_exist(s_)q_predict=self.q_table.loc[s,a]ifs_!='terminal':q_target=r+self.gamma*self.q_table.loc[s_,a_]# next state is not terminalelse:q_target=r# next state is terminalerror=q_target-q_predict# increase trace amount for visited state-action pair# Method 1:# self.eligibility_trace.loc[s, a] += 1# Method 2:self.eligibility_trace.loc[s,:]*=0self.eligibility_trace.loc[s,a]=1# Q updateself.q_table+=self.lr*error*self.eligibility_trace# decay eligibility trace after updateself.eligibility_trace*=self.gamma*self.lambda_