2024 (3)

Causal Imitation for Markov Decision Processes: a Partial Identification Approach.
Ruan, K.; Zhang, J.; Di, X.; and Bareinboim, E.
NeurIPS-24. Advances in Neural Information Processing Systems. 2024.

@article{ruan2024imitation,
  title={Causal Imitation for Markov Decision Processes: a Partial Identification Approach},
  author={Ruan, Kangrui and Zhang, Junzhe and Di, Xuan and Bareinboim, Elias},
  journal={NeurIPS-24. Advances in Neural Information Processing Systems},
  year={2024},
  url_Proceedings={/assets/pdf/r17.pdf}
}

Abstract: Imitation learning enables an agent to learn from expert demonstrations when the performance measure is unknown and the reward signal is not specified. Standard imitation methods do not generally apply when the learner and the expert's sensory capabilities mismatch and demonstrations are contaminated with unobserved confounding bias. To address these challenges, recent advancements in causal imitation learning have been pursued. However, these methods often require access to underlying causal structures that might not always be available, posing practical challenges. In this paper, we investigate robust imitation learning within the framework of canonical Markov Decision Processes (MDPs) using partial identification, allowing the agent to achieve expert performance even when the system dynamics are not uniquely determined from the confounded expert demonstrations. Specifically, first, we theoretically demonstrate that when unobserved confounders (UCs) exist in an MDP, the learner is generally unable to imitate expert performance. We then explore imitation learning in partially identifiable settings — either transition distribution or reward function is non-identifiable from the available data and knowledge. Augmenting the celebrated GAIL method (Ho & Ermon, 2016), our analysis leads to two novel causal imitation algorithms that can obtain effective policies guaranteed to achieve expert performance.

Towards safe policy learning under partial identifiability: A causal approach.
Joshi, S.*; Zhang, J.*; and Bareinboim, E.
AAAI-24. Proceedings of the AAAI Conference on Artificial Intelligence, 38(12): 13004–13012. 2024.
(* Equal contribution)

@article{joshi2024towards,
  title={Towards safe policy learning under partial identifiability: A causal approach},
  author={Joshi, Shalmali * and Zhang, Junzhe * and Bareinboim, Elias},
  journal={AAAI-24. Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={38},
  number={12},
  pages={13004--13012},
  year={2024},
  bibbase_note={(* Equal contribution)},
  url_Proceedings={/assets/pdf/r15.pdf}
}

Abstract: Learning personalized treatment policies is a formative challenge in many real-world applications, including healthcare, econometrics, and artificial intelligence. However, the effectiveness of candidate policies is not always identifiable, i.e., it is not uniquely computable from the combination of the available data and assumptions about the generating mechanisms. This paper studies policy learning from data collected in various non-identifiable settings, i.e., (1) observational studies with unobserved confounding; (2) randomized experiments with partial observability; and (3) their combinations. We derive sharp, closed-form bounds from observational and experimental data over the conditional treatment effects. Based on these novel bounds, we further characterize the problem of safe policy learning and develop an algorithm that trains a policy from data guaranteed to achieve, at least, the performance of the baseline policy currently deployed. Finally, we validate our proposed algorithm on synthetic data and a large clinical trial, demonstrating that it guarantees safe behaviors and robust performance.

Scores for learning discrete causal graphs with unobserved confounders.
Bellot, A.; Zhang, J.; and Bareinboim, E.
AAAI-24. Proceedings of the AAAI Conference on Artificial Intelligence, 38(10): 11043–11051. 2024.

@article{bellot2024scores,
  title={Scores for learning discrete causal graphs with unobserved confounders},
  author={Bellot, Alexis and Zhang, Junzhe and Bareinboim, Elias},
  journal={AAAI-24. Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={38},
  number={10},
  pages={11043--11051},
  year={2024},
  url_Proceedings={/assets/pdf/r14.pdf}
}

Abstract: Structural learning is arguably one of the most challenging and pervasive tasks found throughout the data sciences. There exists a growing literature that studies structural learning in non-parametric settings where conditional independence constraints are taken to define the equivalence class. In the presence of unobserved confounders, it is understood that non-conditional independence constraints are imposed over the observational distribution, including certain equalities and inequalities between functionals of the joint distribution. In this paper, we develop structural learning methods that leverage additional constraints beyond conditional independences. Specifically, we first introduce a score for arbitrary graphs combining Watanabe's asymptotic expansion of the marginal likelihood and new bounds over the cardinality of the exogenous variables. Second, we show that the new score has desirable properties in terms of expressiveness and computability. In terms of expressiveness, we prove that the score captures distinct constraints imprinted in the data, including Verma's and inequality constraints. In terms of computability, we show properties of score equivalence and decomposability, which allow, in principle, breaking the problem of structural learning into smaller and more manageable pieces. Third, we implement this score using an MCMC sampling algorithm and test its properties in several simulation scenarios.

2023 (1)

Causally Aligned Curriculum Learning.
Li, M.; Zhang, J.; and Bareinboim, E.
ICLR-23. The Twelfth International Conference on Learning Representations. 2023.

@article{li2023causally,
  title={Causally Aligned Curriculum Learning},
  author={Li, Mingxuan and Zhang, Junzhe and Bareinboim, Elias},
  journal={ICLR-23. The Twelfth International Conference on Learning Representations},
  year={2023},
  url_Proceedings={/assets/pdf/r16.pdf}
}

Abstract: A pervasive challenge in Reinforcement Learning (RL) is the “curse of dimensionality” which is the exponential growth in the state-action space when optimizing a high-dimensional target task (Bellman, 95). The framework of curriculum learning trains the agent in a curriculum composed of a sequence of related and more manageable source tasks. The expectation is that when some optimal decision rules are shared across source tasks and the target task, the agent could more quickly pick up the necessary skills to behave optimally in the environment, thus accelerating the learning process. However, this critical assumption of invariant optimal decision rules does not necessarily hold in many practical applications, specifically when the underlying environment contains unobserved confounders. This paper studies the problem of curriculum RL through causal lenses. We derive a sufficient graphical condition characterizing causally aligned source tasks, i.e., the invariance of optimal decision rules holds. We further develop an efficient algorithm to generate a causally aligned curriculum, provided with qualitative causal knowledge of the target environment. Finally, we validate our proposed methodology through experiments in high-dimensional confounded environments.

2022 (4)

Causal imitation learning via inverse reinforcement learning.
Ruan, K.*; Zhang, J.*; Di, X.; and Bareinboim, E.
ICLR-22. The Eleventh International Conference on Learning Representations. 2022.
(* Equal contribution)

@article{ruan2022causal,
  title={Causal imitation learning via inverse reinforcement learning},
  author={Ruan, Kangrui * and Zhang, Junzhe * and Di, Xuan and Bareinboim, Elias},
  journal={ICLR-22. The Eleventh International Conference on Learning Representations},
  year={2022},
  bibbase_note={(* Equal contribution)},
  url_Proceedings={/assets/pdf/r13.pdf}
}

Abstract: One of the most common ways children learn when unfamiliar with the environment is by mimicking adults. Imitation learning concerns an imitator learning to behave in an unknown environment from an expert’s demonstration; reward signals remain latent to the imitator. This paper studies imitation learning through causal lenses and extends the analysis and tools developed for behavior cloning (Zhang, Kumor, Bareinboim, 2020) to inverse reinforcement learning. First, we propose novel graphical conditions that allow the imitator to learn a policy performing as well as the expert’s behavior policy, even when the imitator and the expert’s state-action space disagree, and unobserved confounders (UCs) are present. When provided with parametric knowledge about the unknown reward function, such a policy may outperform the expert’s. Also, our method is easily extensible and allows one to leverage existing IRL algorithms even when UCs are present, including the multiplicative-weights algorithm (MWAL) (Syed & Schapire, 2008) and the generative adversarial imitation learning (GAIL) (Ho & Ermon, 2016). Finally, we validate our framework by simulations using real-world and synthetic data.

Online reinforcement learning for mixed policy scopes.
Zhang, J.; and Bareinboim, E.
NeurIPS-22. Advances in Neural Information Processing Systems, 35: 3191–3202. 2022.

@article{zhang2022online,
  title={Online reinforcement learning for mixed policy scopes},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={NeurIPS-22. Advances in Neural Information Processing Systems},
  volume={35},
  pages={3191--3202},
  year={2022},
  url_Proceedings={/assets/pdf/r12.pdf}
}

Abstract: Combination therapy refers to the use of multiple treatments – such as surgery, medication, and behavioral therapy – to cure a single disease, and has become a cornerstone for treating various conditions including cancer, HIV, and depression. All possible combinations of treatments lead to a collection of treatment regimens (i.e., policies) with mixed scopes, or what physicians could observe and which actions they should take depending on the context. In this paper, we investigate the online reinforcement learning setting for optimizing the policy space with mixed scopes. In particular, we develop novel online algorithms that achieve sublinear regret compared to an optimal agent deployed in the environment. The regret bound has a dependency on the maximal cardinality of the induced state-action space associated with mixed scopes. We further introduce a canonical representation for an arbitrary subset of interventional distributions given a causal diagram, which leads to a non-trivial, minimal representation of the model parameters.

Partial counterfactual identification from observational and experimental data.
Zhang, J.; Tian, J.; and Bareinboim, E.
ICML-22. International Conference on Machine Learning, 26548–26558. 2022.

@article{zhang2022partial,
  title={Partial counterfactual identification from observational and experimental data},
  author={Zhang, Junzhe and Tian, Jin and Bareinboim, Elias},
  journal={ICML-22. International Conference on Machine Learning},
  pages={26548--26558},
  year={2022},
  organization={PMLR},
  url_Proceedings={/assets/pdf/r11.pdf}
}

Abstract: This paper investigates the problem of bounding counterfactual queries from an arbitrary collection of observational and experimental distributions and qualitative knowledge about the underlying data-generating model represented in the form of a causal diagram. We show that all counterfactual distributions in an arbitrary structural causal model (SCM) with discrete observed domains could be generated by a canonical family of SCMs with the same causal diagram where unobserved (exogenous) variables are also discrete, taking values in finite domains. Utilizing the canonical SCMs, we translate the problem of bounding counterfactuals into that of polynomial programming whose solution provides optimal bounds for the counterfactual query. Solving such polynomial programs is in general computationally expensive. We therefore develop effective Monte Carlo algorithms to approximate optimal bounds from a combination of observational and experimental data. Our algorithms are validated extensively on synthetic and real-world datasets.

Can Humans Be out of the Loop?
Zhang, J.; and Bareinboim, E.
CLeaR-22. Conference on Causal Learning and Reasoning, 1010–1025. 2022.

@article{zhang2022can,
  title={Can Humans Be out of the Loop?},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={CLeaR-22. Conference on Causal Learning and Reasoning},
  pages={1010--1025},
  year={2022},
  organization={PMLR},
  url_Proceedings={/assets/pdf/r10.pdf}
}

Abstract: Recent advances in Reinforcement Learning have allowed automated agents (for short, agents) to achieve a high level of performance across a wide range of tasks, which when supplemented with human feedback has led to faster and more robust decision-making. The current literature, in large part, focuses on the human's role during the learning phase: human trainers possess a priori knowledge that could help an agent to accelerate its learning when the environment is not fully known. In this paper, we study an interactive reinforcement learning setting where the agent and the human have different sensory capabilities, disagreeing, therefore, on how they perceive the world (observed states) while sharing the same reward and transition functions. We show that agents are bound to learn sub-optimal policies if they do not take into account human advice, perhaps surprisingly, even when the human's decisions are less accurate than their own. We propose the counterfactual agent, who proactively considers the intended actions of the human operator, and prove that this strategy dominates standard approaches regarding performance. Finally, we formulate a novel reinforcement learning task maximizing the performance of an autonomous system subject to a budget constraint over the available amount of human advice.

2021 (2)

Sequential causal imitation learning with unobserved confounders.
Kumor, D.; Zhang, J.; and Bareinboim, E.
NeurIPS-21. Advances in Neural Information Processing Systems, 34: 14669–14680. 2021.

@article{kumor2021sequential,
  title={Sequential causal imitation learning with unobserved confounders},
  author={Kumor, Daniel and Zhang, Junzhe and Bareinboim, Elias},
  journal={NeurIPS-21. Advances in Neural Information Processing Systems},
  volume={34},
  pages={14669--14680},
  year={2021},
  url_Proceedings={/assets/pdf/r9.pdf}
}

Abstract: "Monkey see monkey do" is an age-old adage, referring to naive imitation without a deep understanding of a system's underlying mechanics. Indeed, if a demonstrator has access to information unavailable to the imitator (monkey), such as a different set of sensors, then no matter how perfectly the imitator models its perceived environment (See), attempting to reproduce the demonstrator's behavior (Do) can lead to poor outcomes. Imitation learning in the presence of a mismatch between demonstrator and imitator has been studied in the literature under the rubric of causal imitation learning (Zhang, Kumor, and Bareinboim, 2020), but existing solutions are limited to single-stage decision-making. This paper investigates the problem of causal imitation learning in sequential settings, where the imitator must make multiple decisions per episode. We develop a graphical criterion that is necessary and sufficient for determining the feasibility of causal imitation, providing conditions when an imitator can match a demonstrator's performance despite differing capabilities. Finally, we provide an efficient algorithm for determining imitability and corroborate our theory with simulations.

Bounding causal effects on continuous outcome.
Zhang, J.; and Bareinboim, E.
AAAI-21. Proceedings of the AAAI Conference on Artificial Intelligence, 35(13): 12207–12215. 2021.

@article{zhang2021bounding,
  title={Bounding causal effects on continuous outcome},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={AAAI-21. Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={35},
  number={13},
  pages={12207--12215},
  year={2021},
  url_Proceedings={/assets/pdf/r8.pdf}
}

Abstract: We investigate the problem of bounding causal effects from experimental studies in which treatment assignment is randomized but the subject compliance is imperfect. It is well known that under such conditions, the actual causal effects are not point-identifiable due to uncontrollable unobserved confounding. In their seminal work, Balke and Pearl (1994) derived the tightest bounds over the causal effects in this setting by employing an algebra program to derive analytic expressions. However, Pearl’s approach assumes the primary outcome to be discrete and finite. Solving such a program could be intractable when high-dimensional context variables are present. In this paper, we present novel non-parametric methods to bound causal effects on the continuous outcome from studies with imperfect compliance. These bounds could be generalized to settings with the high-dimensional context.

2020 (2)

Causal imitation learning with unobserved confounders.
Zhang, J.; Kumor, D.; and Bareinboim, E.
NeurIPS-20. Advances in Neural Information Processing Systems, 33: 12263–12274. 2020.

@article{zhang2020causal,
  title={Causal imitation learning with unobserved confounders},
  author={Zhang, Junzhe and Kumor, Daniel and Bareinboim, Elias},
  journal={NeurIPS-20. Advances in Neural Information Processing Systems},
  volume={33},
  pages={12263--12274},
  year={2020},
  url_Proceedings={/assets/pdf/r7.pdf}
}

Abstract: One of the common ways children learn is by mimicking adults. Imitation learning focuses on learning policies with suitable performance from demonstrations generated by an expert, with an unspecified performance measure, and unobserved reward signal. Popular methods for imitation learning start by either directly mimicking the behavior policy of an expert (behavior cloning) or by learning a reward function that prioritizes observed expert trajectories (inverse reinforcement learning). However, these methods rely on the assumption that covariates used by the expert to determine her/his actions are fully observed. In this paper, we relax this assumption and study imitation learning when sensory inputs of the learner and the expert differ. First, we provide a non-parametric, graphical criterion that is complete (both necessary and sufficient) for determining the feasibility of imitation from the combinations of demonstration data and qualitative assumptions about the underlying environment, represented in the form of a causal model. We then show that when such a criterion does not hold, imitation could still be feasible by exploiting quantitative knowledge of the expert trajectories. Finally, we develop an efficient procedure for learning the imitating policy from experts' trajectories.

Designing optimal dynamic treatment regimes: A causal reinforcement learning approach.
Zhang, J.; and Bareinboim, E.
ICML-20. International Conference on Machine Learning, 11012–11022. 2020.

@article{zhang2020designing,
  title={Designing optimal dynamic treatment regimes: A causal reinforcement learning approach},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={ICML-20. International Conference on Machine Learning},
  pages={11012--11022},
  year={2020},
  organization={PMLR},
  url_Proceedings={/assets/pdf/r6.pdf}
}

Abstract: A dynamic treatment regime (DTR) consists of a sequence of decision rules, one per stage of intervention, that dictates how to determine the treatment assignment to patients based on evolving treatments and covariates' history. These regimes are particularly effective for managing chronic disorders and are arguably one of the critical ingredients underlying more personalized decision-making systems. All reinforcement learning algorithms for finding the optimal DTR in online settings will suffer \(\Omega(\sqrt{|X \cup S|T})\) regret on some environments, where \(T\) is the number of experiments and \(X \cup S\) is the domains of the treatments \(X\) and covariates \(S\). This implies that \(T = \Omega(|X \cup S|)\) trials will be required to generate an optimal DTR. In many applications, the domains of \(X\) and \(S\) could be enormous, which means that the time required to ensure appropriate learning may be unattainable. We show that, if the causal diagram of the underlying environment is provided, one could achieve regret that is exponentially smaller than \(X \cup S\). In particular, we develop two online algorithms that satisfy such regret bounds by exploiting the causal structure underlying the DTR; one is based on the principle of optimism in the face of uncertainty, and the other uses posterior sampling. Finally, we introduce efficient methods to accelerate these online learning procedures by leveraging the abundant, yet biased observational (non-experimental) data.

2019 (1)

Near-optimal reinforcement learning in dynamic treatment regimes.
Zhang, J.; and Bareinboim, E.
NeurIPS-19. Advances in Neural Information Processing Systems, 32. 2019.

@article{zhang2019near,
  title={Near-optimal reinforcement learning in dynamic treatment regimes},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={NeurIPS-19. Advances in Neural Information Processing Systems},
  volume={32},
  year={2019},
  url_Proceedings={/assets/pdf/r5.pdf}
}

Abstract: A dynamic treatment regime (DTR) consists of a sequence of decision rules, one per stage of intervention, that dictates how to determine the treatment assignment to patients based on evolving treatments and covariates' history. These regimes are particularly effective for managing chronic disorders and are arguably one of the key aspects towards more personalized decision-making. In this paper, we investigate the online reinforcement learning (RL) problem for selecting optimal DTRs provided that observational data is available. We develop the first adaptive algorithm that achieves near-optimal regret in DTRs in online settings, without any access to historical data. We further derive informative bounds on the system dynamics of the underlying DTR from confounded, observational data. Finally, we combine these results and develop a novel RL algorithm that efficiently learns the optimal DTR while leveraging the abundant, yet imperfect confounded observations.

2018 (3)

Equality of opportunity in classification: A causal approach.
Zhang, J.; and Bareinboim, E.
NeurIPS-18. Advances in Neural Information Processing Systems, 31. 2018.

@article{zhang2018equality,
  title={Equality of opportunity in classification: A causal approach},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={NeurIPS-18. Advances in Neural Information Processing Systems},
  volume={31},
  year={2018},
  url_Proceedings={/assets/pdf/r4.pdf}
}

Abstract: The Equalized Odds (for short, EO) is one of the most popular measures of discrimination used in the supervised learning setting. It ascertains fairness through the balance of the misclassification rates (false positive and negative) across the protected groups – e.g., in the context of law enforcement, an African-American defendant who would not commit a future crime will have an equal opportunity of being released, compared to a non-recidivating Caucasian defendant. Despite this noble goal, it has been acknowledged in the literature that statistical tests based on the EO are oblivious to the underlying causal mechanisms that generated the disparity in the first place (Hardt et al. 2016). This leads to a critical disconnect between statistical measures readable from the data and the meaning of discrimination in the legal system, where compelling evidence that the observed disparity is tied to a specific causal process deemed unfair by society is required to characterize discrimination. The goal of this paper is to develop a principled approach to connect the statistical disparities characterized by the EO and the underlying, elusive, and frequently unobserved, causal mechanisms that generated such inequality. We start by introducing a new family of counterfactual measures that allows one to explain the misclassification disparities in terms of the underlying mechanisms in an arbitrary, non-parametric structural causal model. This will, in turn, allow legal and data analysts to interpret currently deployed classifiers through causal lens, linking the statistical disparities found in the data to the corresponding causal processes. Leveraging the new family of counterfactual measures, we develop a learning procedure to construct a classifier that is statistically efficient, interpretable, and compatible with the basic human intuition of fairness. We demonstrate our results through experiments in both real (COMPAS) and synthetic datasets.

Non-parametric path analysis in structural causal models.
Zhang, J.; and Bareinboim, E.
UAI-18. Proceedings of the 34th Conference on Uncertainty in Artificial Intelligence. 2018.

@article{zhang2018non,
  title={Non-parametric path analysis in structural causal models},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={UAI-18. Proceedings of the 34th Conference on Uncertainty in Artificial Intelligence},
  year={2018},
  url_Proceedings={/assets/pdf/r3.pdf}
}

Abstract: One of the fundamental tasks in causal inference is to decompose the observed association between a decision X and an outcome Y into its most basic structural mechanisms. In this paper, we introduce counterfactual measures for effects along with a specific mechanism, represented as a path from X to Y in an arbitrary structural causal model. We derive a novel non-parametric decomposition formula that expresses the covariance of X and Y as a sum over unblocked paths from X to Y contained in an arbitrary causal model. This formula allows a fine-grained path analysis without requiring a commitment to any particular parametric form, and can be seen as a generalization of Wright’s decomposition method in linear systems (1923, 1932) and Pearl’s nonparametric mediation formula (2001).

Fairness in decision-making—the causal explanation formula.
Zhang, J.; and Bareinboim, E.
AAAI-18. Proceedings of the AAAI Conference on Artificial Intelligence, 32(1). 2018.

@article{zhang2018fairness,
  title={Fairness in decision-making—the causal explanation formula},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={AAAI-18. Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={32},
  number={1},
  year={2018},
  url_Proceedings={/assets/pdf/r2.pdf}
}

Abstract: AI plays an increasingly prominent role in society since decisions that were once made by humans are now delegated to automated systems. These systems are currently in charge of deciding bank loans, criminals' incarceration, and the hiring of new employees, and it's not difficult to envision that they will in the future underpin most of the decisions in society. Despite the high complexity entailed by this task, there is still not much understanding of basic properties of such systems. For instance, we currently cannot detect (neither explain nor correct) whether an AI system is operating fairly (i.e., is abiding by the decision-constraints agreed by society) or it is reinforcing biases and perpetuating a preceding prejudicial practice. Issues of discrimination have been discussed extensively in legal circles, but there exists still not much understanding of the formal conditions that an automated system must adhere to be deemed fair. In this paper, we use the language of structural causality (Pearl, 2000) to fill in this gap. We start by introducing three new fine-grained measures of transmission of change from stimulus to effect called counterfactual direct (Ctf-DE), indirect (Ctf-IE), and spurious (Ctf-SE) effects. Building on these measures, we derive the causal explanation formula, which allows the AI designer to quantitatively evaluate fairness and explain the total observed disparity of decisions through different discriminatory mechanisms. We apply these results to various discrimination analysis tasks and run extensive simulations, including detection, evaluation, and optimization of decision-making under fairness constraints. We conclude studying the trade-off between different types of fairness criteria (outcome and procedural), and provide a quantitative approach to policy implementation and the design of fair decision-making systems.

2017 (1)

Transfer learning in multi-armed bandits: a causal approach.
Zhang, J.; and Bareinboim, E.
IJCAI-17. Proceedings of the 26th International Joint Conference on Artificial Intelligence, 1340–1346. 2017.

@article{zhang2017transfer,
  title={Transfer learning in multi-armed bandits: a causal approach},
  author={Zhang, Junzhe and Bareinboim, Elias},
  journal={IJCAI-17. Proceedings of the 26th International Joint Conference on Artificial Intelligence},
  pages={1340--1346},
  year={2017},
  url_Proceedings={/assets/pdf/r1.pdf}
}

Abstract: Reinforcement learning (RL) agents have been deployed in complex environments where interactions are costly, and learning is usually slow. One prominent task in these settings is to reuse interactions performed by other agents to accelerate the learning process. Causal inference provides a family of methods to infer the effects of actions from a combination of data and qualitative assumptions about the underlying environment. Despite its success of transferring invariant knowledge across domains in the empirical sciences, causal inference has not been fully realized in the context of transfer learning in interactive domains. In this paper, we use causal inference as a basis to support a principled and more robust transfer of knowledge in RL settings. In particular, we tackle the problem of transferring knowledge across bandit agents in settings where causal effects cannot be identified by do-calculus (Pearl, 2000) and standard learning techniques. Our new identification strategy combines two steps – first, deriving bounds over the arms distribution based on structural knowledge; second, incorporating these bounds in a dynamic allocation procedure so as to guide the search towards more promising actions. We formally prove that our strategy dominates previously known algorithms and achieves orders of magnitude faster convergence rates than these algorithms. Finally, we perform simulations and empirically demonstrate that our strategy is consistently more efficient than the current (non-causal) state-of-the-art methods.
\n"}; document.write(bibbase_data.data);