I wonder how to add the regression line equation and R^2 to a ggplot. My code is:

library(ggplot2)

# Simulate 100 noisy observations scattered around the line y = 2 + 3x.
df <- data.frame(x = 1:100)
df[["y"]] <- 2 + 3 * df[["x"]] + rnorm(100, sd = 40)
# Black least-squares line (confidence band switched off) with the points on top.
p <- ggplot(df, aes(x = x, y = y)) +
  geom_smooth(formula = y ~ x, method = "lm", colour = "black", se = FALSE) +
  geom_point()
p


Any help will be highly appreciated.

## Solution 1

Here is one solution

# GET EQUATION AND R-SQUARED AS STRING

# Fit y ~ x on `df` and return the fitted equation and R-squared as a
# length-one character string holding a plotmath expression, ready for
# geom_text(..., parse = TRUE).
# df: data frame with columns named x and y.
lm_eqn <- function(df) {
  m <- lm(y ~ x, df)
  # Pick the intercept and the slope individually; passing the whole
  # coefficient vector would put both numbers into each placeholder.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   list(a = format(unname(coef(m)[1]), digits = 2),
                        b = format(unname(coef(m)[2]), digits = 2),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq))
}

p1 <- p + geom_text(x = 25, y = 300, label = lm_eqn(df), parse = TRUE)

EDIT. I figured out the source from where I picked this code. Here is the link to the original post in the ggplot2 google groups

## Solution 2

Statistic stat_poly_eq() in my package ggpmisc makes it possible add text labels based on a linear model fit.

This answer has been updated for 'ggpmisc' (>= 0.4.0) and 'ggplot2' (>= 3.3.0) on 2022-06-02.

In the examples I use stat_poly_line() instead of stat_smooth() as it has the same defaults as stat_poly_eq() for method and formula. I have omitted in all code examples the additional arguments to stat_poly_line() as they are irrelevant to the question of adding labels.

library(ggplot2)
library(ggpmisc)
#> Loading required package: ggpp
#>
#> Attaching package: 'ggpp'
#> The following object is masked from 'package:ggplot2':
#>
#>     annotate

# artificial data
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$yy <- 2 + 3 * df$x + 0.1 * df$x^2 + rnorm(100, sd = 40)

# defaults for formula, label and method
ggplot(df, aes(x, y)) +
  stat_poly_line() +
  stat_poly_eq() +
  geom_point()


# one label combining the equation and R2
ggplot(df, aes(x, y)) +
  stat_poly_line() +
  stat_poly_eq(
    mapping = aes(
      label = paste(after_stat(eq.label), after_stat(rr.label),
                    sep = "*\", \"*")
    )
  ) +
  geom_point()


# the equation and R2 as two separate labels
ggplot(df, aes(x, y)) +
  stat_poly_line() +
  stat_poly_eq(mapping = aes(label = after_stat(eq.label))) +
  stat_poly_eq(label.y = 0.9) +
  geom_point()


# regression forced through the origin
ggplot(df, aes(x, y)) +
  stat_poly_line(formula = y ~ x + 0) +
  stat_poly_eq(
    mapping = aes(label = after_stat(eq.label)),
    formula = y ~ x + 0
  ) +
  geom_point()


# second-degree polynomial fit
ggplot(df, aes(x, yy)) +
  stat_poly_line(formula = y ~ poly(x, 2, raw = TRUE)) +
  stat_poly_eq(
    mapping = aes(label = after_stat(eq.label)),
    formula = y ~ poly(x, 2, raw = TRUE)
  ) +
  geom_point()


# a hat on the left-hand side, as asked by @MYaseen208 and @elarry
ggplot(df, aes(x, y)) +
  stat_poly_line() +
  stat_poly_eq(
    mapping = aes(
      label = paste(after_stat(eq.label), after_stat(rr.label),
                    sep = "*\", \"*")
    ),
    eq.with.lhs = "italic(hat(y))~=~"
  ) +
  geom_point()


# variable substitution, as asked by @shabbychef:
# the equation uses the same symbols as the axis labels
ggplot(df, aes(x, y)) +
  stat_poly_line() +
  stat_poly_eq(
    mapping = aes(label = after_stat(eq.label)),
    eq.with.lhs = "italic(h)~=~",
    eq.x.rhs = "~italic(z)"
  ) +
  labs(x = expression(italic(z)), y = expression(italic(h))) +
  geom_point()


# grouping as asked by @helen.h
dfg <- data.frame(x = c(1:100))
# c(0, 1) is recycled, so every second row is shifted up by 20 (group "B").
dfg$y <- 20 * c(0, 1) + 3 * dfg$x + rnorm(100, sd = 40)
dfg$group <- factor(rep(c("A", "B"), 50))

ggplot(data = dfg, aes(x = x, y = y, colour = group)) +
  stat_poly_line() +
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
                                 after_stat(rr.label), sep = "*\", \"*"))) +
  geom_point()

ggplot(data = dfg, aes(x = x, y = y, linetype = group, grp.label = group)) +
  stat_poly_line() +
  stat_poly_eq(aes(label = paste(after_stat(grp.label), "*\": \"*",
                                 after_stat(eq.label), "*\", \"*",
                                 after_stat(rr.label), sep = ""))) +
  geom_point()

# a single fit with grouped data as asked by @Herman
ggplot(data = dfg, aes(x = x, y = y)) +
  stat_poly_line() +
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
                                 after_stat(rr.label), sep = "*\", \"*"))) +
  geom_point(aes(colour = group))

# facets
ggplot(data = dfg, aes(x = x, y = y)) +
  stat_poly_line() +
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
                                 after_stat(rr.label), sep = "*\", \"*"))) +
  geom_point() +
  facet_wrap(~group)

Created on 2022-06-02 by the reprex package (v2.0.1)

## Solution 3

I changed a few lines of the source of stat_smooth and related functions to make a new function that adds the fit equation and R squared value. This will work on facet plots too!

library(devtools)
source_gist("524eade46135f6348140")
df = data.frame(x = c(1:100))
df$y = 2 + 5 * df$x + rnorm(100, sd = 40)
df$class = rep(1:2,50)
ggplot(data = df, aes(x = x, y = y, label=y)) +
  stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) +
  geom_smooth(method="lm",se=FALSE) +
  geom_point() + facet_wrap(~class)


I used the code in @Ramnath's answer to format the equation. The stat_smooth_func function isn't very robust, but it shouldn't be hard to play around with it.

https://gist.github.com/kdauria/524eade46135f6348140. Try updating ggplot2 if you get an error.

## Solution 4

I've modified Ramnath's post to a) make more generic so it accepts a linear model as a parameter rather than the data frame and b) displays negatives more appropriately.

# Return the fitted equation and R-squared of a simple linear model as a
# length-one character string holding a plotmath expression.
# m: a fitted model whose first coefficient is the intercept and whose
#    second coefficient is the slope, e.g. lm(y ~ x, df).
lm_eqn = function(m) {

  intercept <- unname(coef(m)[1])
  slope <- unname(coef(m)[2])

  # The slope is formatted as a magnitude; its sign is carried by the
  # operator chosen below, so a negative slope reads "a - b x".
  l <- list(a = format(intercept, digits = 2),
            b = format(abs(slope), digits = 2),
            r2 = format(summary(m)$r.squared, digits = 3));

  if (slope >= 0) {
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  } else {
    eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  }

  as.character(as.expression(eq));
}

Usage would change to:

p1 = p + geom_text(aes(x = 25, y = 300, label = lm_eqn(lm(y ~ x, df))), parse = TRUE)

## Solution 5

Here's the most simplest code for everyone

Note: Showing Pearson's Rho and not R^2.

library(ggplot2)
library(ggpubr)

df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
p <- ggplot(data = df, aes(x = x, y = y)) +
  geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
  geom_point()+
  stat_cor(label.y = 35)+ #this means at 35th unit in the y axis, the r squared and p value will be shown
  stat_regline_equation(label.y = 30) #this means at 30th unit regresion line equation will be shown
p

## Solution 6

Using ggpubr:

library(ggpubr)

# reproducible data
set.seed(1)
df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)

# By default showing Pearson R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
  stat_cor(label.y = 300) +
  stat_regline_equation(label.y = 280)

# Use R2 instead of R
ggscatter(df, x = "x", y = "y", add = "reg.line") +
  stat_cor(label.y = 300,
           aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))) +
  stat_regline_equation(label.y = 280)

## compare R2 with accepted answer
# m <- lm(y ~ x, df)
# round(summary(m)$r.squared, 2)
# [1] 0.85


## Solution 7

I really love @Ramnath's solution. To allow customizing the regression formula (instead of fixing y and x as literal variable names), and to add the p-value to the printout as well (as @Jerry T commented), here is the mod:

# Fit `y ~ x` on `df` and return the equation, R-squared and p-value as a
# length-one character string holding a plotmath expression, ready for
# geom_text(..., parse = TRUE).
# df: data frame holding both variables.
# y, x: column names (character strings) of the response and the predictor.
lm_eqn <- function(df, y, x){
  model_formula <- as.formula(sprintf('%s ~ %s', y, x))
  m <- lm(model_formula, data = df)
  # formatting the values into a summary string to print out
  # ~ gives some space, but the equals sign and the comma need to be quoted
  eq <- substitute(italic(target) == a + b %.% italic(input)*","~~italic(r)^2~"="~r2*","~~p~"="~italic(pvalue),
                   list(target = y,
                        input = x,
                        # intercept and slope are picked individually
                        a = format(as.vector(coef(m))[1], digits = 2),
                        b = format(as.vector(coef(m))[2], digits = 2),
                        r2 = format(summary(m)$r.squared, digits = 3),
                        # getting the pvalue is painful: it is read from row 2
                        # (the predictor) of the coefficient table
                        pvalue = format(summary(m)$coefficients[2, 'Pr(>|t|)'], digits = 1)
                   )
  )
  as.character(as.expression(eq))
}

# NOTE(review): the opening ggplot() call was missing from this snippet, so
# the layers could not be added to anything. The wt/hp mapping below is
# inferred from lm_eqn(mtcars, 'hp', 'wt') and the label position -- confirm.
ggplot(mtcars, aes(x = wt, y = hp)) +
  geom_point() +
  ggrepel::geom_text_repel(label=rownames(mtcars)) +
  geom_text(x=3,y=300,label=lm_eqn(mtcars, 'hp','wt'),color='red',parse=TRUE) +
  geom_smooth(method='lm')


Unfortunately, this doesn't work with facet_wrap or facet_grid.

## Solution 8

Another option would be to create a custom function generating the equation using dplyr and broom libraries:

# Build a one-line text equation such as "y ~ 6.22 + 3.16 * x" from a fitted
# model, with every estimate rounded to 2 decimals.
# model: a fitted model accepted by broom::tidy().
# Returns a length-one character string. The response is always written as
# "y", and the "y ~ " prefix is only produced for a term named "(Intercept)".
get_formula <- function(model) {

  broom::tidy(model)[, 1:2] %>%
    mutate(
      negative = estimate < 0,
      # magnitudes only; each sign is re-attached explicitly below
      magnitude = abs(round(estimate, 2)),
      a = ifelse(
        term == '(Intercept)',
        # keep the sign of a negative intercept instead of dropping it
        paste0('y ~ ', ifelse(negative, '-', ''), magnitude),
        paste0(ifelse(negative, ' - ', ' + '), magnitude, ' * ', term)
      )
    ) %>%
    summarise(formula = paste(a, collapse = '')) %>%
    as.character()

}

# Fit the simple linear model, then print its text equation.
model <- lm(y ~ x, data = df)
get_formula(model)
#"y ~ 6.22 + 3.16 * x"

r_squared <- scales::percent(summary(model)$r.squared, accuracy = 0.01)

Now we need to add the text to the plot:

p +
  geom_text(x = 20, y = 300, label = get_formula(model), color = 'red') +
  geom_text(x = 20, y = 285, label = r_squared, color = 'blue')

## Solution 9

Inspired by the equation style provided in this answer, a more generic approach (more than one predictor + latex output as option) can be:

# Print the fitted equation of a linear model with cat().
# model: a fitted model exposing `$coefficients`, with the intercept first
#        (e.g. an lm object).
# latex: if TRUE, print a LaTeX equation wrapped in `$...$` that also
#        reports the residual standard error; otherwise print plain text.
# ...:   passed on to format() for the coefficients (e.g. digits = 3).
# Called for its printed output; the return value is that of cat().
print_equation= function(model, latex= FALSE, ...){
  dots <- list(...)
  cc <- model$coefficients

  # Separator in front of each predictor; the intercept keeps its own sign.
  var_sign <- ifelse(cc[-1] < 0, " - ", " + ")

  # Format signed and absolute coefficients with the caller's format() options.
  cc_ <- do.call(format, args = c(list(x = cc), dots))
  cc_abs <- do.call(format, args = c(list(x = abs(cc)), dots))

  # The symbols must be defined before the predictor terms are assembled.
  y_var <- as.character(stats::formula(model))[2]
  x_vars <- names(cc_)[-1]
  if (latex) {
    star <- " \\cdot "
    y_var <- paste0("\\hat{", y_var, "_{i}}")
    x_vars <- paste0(x_vars, "_{i}")
  } else {
    star <- " * "
  }

  # One "<sign> <magnitude><star><name>" piece per predictor, glued together.
  pred_vars <- paste(
    paste(var_sign, paste(cc_abs[-1], x_vars, sep = star)),
    collapse = ""
  )

  equ <- paste(y_var, "=", cc_[1], pred_vars)
  if (latex) {
    # Use the residual standard error of the model that was passed in.
    equ <- paste0("$", equ,
                  " + \\hat{\\varepsilon_{i}} \\quad where \\quad \\varepsilon \\sim \\mathcal{N}(0,",
                  summary(model)$sigma, ")", "$")
  }
  cat(equ)
}

The model argument expects an lm object, the latex argument is a boolean to ask for a simple character or a latex-formated equation, and the ... argument pass its values to the format function.

I also added an option to output it as latex so you can use this function in a rmarkdown like this:

```{r echo=FALSE, results='asis'}
print_equation(model = lm_mod, latex = TRUE)
```

Now using it:

df <- data.frame(x = c(1:100))
df$y <- 2 + 3 * df$x + rnorm(100, sd = 40)
df$z <- 8 + 3 * df$x + rnorm(100, sd = 40)
# Two-predictor model, printed as a plain-text equation.
lm_mod <- lm(y ~ x + z, data = df)

print_equation(lm_mod, latex = FALSE)


This code yields: y = 11.3382963933174 + 2.5893419 * x + 0.1002227 * z

And if we ask for a latex equation, rounding the parameters to 3 digits:

print_equation(model = lm_mod, latex = TRUE, digits= 3)


This yields: