Reshaping data from long-ish to wide (and back to fully long)

Today we had a hackathon-style session on changing data from long-ish format (in which participants represented in a dataset may have more than one row — for example, representing timepoints) to wide format (in which each participant is represented in only one row), and then back to fully long (in which each row represents one participant-timepoint-variable combination).

Here’s some annotated example / reference code derived from what we drafted in-session:

First, we’ll load in some randomly-generated example data:


# Normally, we could read this from a CSV. For this post, though, we can read our data directly as an R object (WordPress.com, which this site uses, doesn't allow .CSV uploads):

mydata <-
structure(list(person_id = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L,
5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L),
event_name = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("post",
"pre"), class = "factor"), measure_1 = c(0.090292542033616,
0.772188909216967, 0.411215853517627, 0.893520693680764,
0.975884110589112, 0.191220783404496, 0.295130497585035,
0.281881693414804, NA, NA, 0.916251491858678, 0.855617042674353,
0.413819663482273, 0.566898921340026, NA, NA, NA, 0.669993262495757,
0.971111572214459, NA, 0.546906695553809, 0.633795367321454
), measure_2 = c(0.377937959429224, NA, 0.270269745259855,
NA, 0.109674141718118, 0.448669601639652, NA, 0.72685811303618,
NA, NA, 0.700375223453256, 0.768455206434367, 0.579069685794662,
NA, 0.330414772734107, 0.368096792956557, 0.833457074741303,
NA, 0.243890539290176, 0.569952201047387, NA, 0.876928368602336
), measure_3 = c(0.602297752478175, 0.181189423838694, 0.181573287464944,
0.622056657187667, 0.04365317661039, 0.822932438391326, NA,
0.811139221693022, NA, 0.147579313853287, 0.485940044084596,
0.569365973953375, 0.665967761846407, 0.674156039301903,
NA, 0.009820636976389, 0.647143287549235, NA, 0.112149360850571,
0.812318140328899, 0.310799441519015, 0.831540553415625),
measure_4 = c(0.713749416173432, 0.132289864624871, 0.228154179611478,
NA, NA, 0.763483651278373, 0.716920086339759, 0.418525713002487,
NA, NA, 0.273025665120284, NA, 0.774730722591496, 0.159313766167581,
0.511725395992406, 0.024104494213395, 0.301143075193437,
0.77578513949998, 0.082894542625418, 0.174816749583109, NA,
0.539324331277699), measure_5 = c(0.223198752914979, 0.12268823934171,
0.427927026044712, NA, 0.672875934541559, 0.519941185969767,
NA, 0.973576029201247, NA, 0.995673438864579, NA, 0.734839733202807,
0.292923203306682, 0.812726500255221, NA, 0.259965894270834,
0.542994025571332, 0.44094878049262, 0.631297956018848, 0.059709396864531,
0.351999845140519, 0.389170170663462), measure_6 = c(0.510191758746456,
0.510559442317453, 0.596252842614878, 0.120855890248108,
0.108096011859314, NA, 0.086334719168399, 0.598323992943591,
NA, 0.305347491114368, 0.353286627193526, NA, 0.595103226162614,
0.534444702374887, 0.980652728883323, 0.583962675549181,
NA, 0.527858131032994, 0.263301660275943, 0.495232583302992,
NA, NA), measure_7 = c(0.085136750949117, 0.268315785131163,
0.551406191204531, NA, 0.753685320493511, 0.299960419856773,
0.748196215710084, 0.342941651586594, NA, 0.638529984715728,
0.54176366820641, NA, 0.13865664857284, 0.659176171485113,
0.765870319104541, 0.882919235765678, 0.111788132851019,
NA, 0.78384464564426, 0.622248554039412, 0.055687196751792,
0.42452167898173)), .Names = c("person_id", "event_name",
"measure_1", "measure_2", "measure_3", "measure_4", "measure_5",
"measure_6", "measure_7"), class = "data.frame", row.names = c(NA,
-22L))

Then, we’ll explore transformations of those data:

# 2017-04-26
# An example: Changing long-ish data to wide

# Load our example dataset
# Note: For this post, you can get the data directly using the code chunk above:
# mydata <- read.csv( 	"~/path/to/2017-04-26/Dummy_Data.csv", 	header = TRUE ) # View(mydata) # View our data # Example of transposing the data: # Transposing (as in MS Office's Excel or LibreOffice Calc's "Paste Special -> Transpose" features) takes the dataset and flips it, moving its top right corner to the bottom left. See https://en.wikipedia.org/wiki/File:Matrix_transpose.gif for an animated explanation.
View(
	t(mydata)
)

# We can make our data wide (one row per person) using the reshape() command:
mydata_wide <- reshape(
	mydata,
	idvar = "person_id",
	timevar = "event_name",
	direction = "wide"
)

View(mydata_wide)

# Here's one way to aggregate all of the "pre-test" variables in this wide version of the data:

	column_names_containing_pre <- grep( # Grep searches for text.
		".pre",
		colnames(mydata_wide),
		value = TRUE # Return the value itself, rather than the index number of each matching element.
	)
	column_names_containing_pre # Check our work

	mydata_wide$pre_measures_mean <- apply(
		mydata_wide[
			,
			column_names_containing_pre
			],
		1, # ?apply states that '1' here means 'iterate over each row'; '2' here means 'iterate over each column'.
		mean,
			na.rm = TRUE # This is for the mean() function.
	)

	mydata_wide$pre_measures_mean[
		#mydata_long$pre_measures_mean == 0
		is.nan(mydata_wide$pre_measures_mean)
		] <- NA

	# Check our work:
	View(
		mydata_wide[,
			c(
				column_names_containing_pre,
				"pre_measures_mean"
			)
		]
	)

# If we wanted to aggregate the data in some way more straightforwardly, we could use the reshape *package* (vs. the reshape *function* above), which contains a pair of functions, melt() and cast().
library(reshape)

mydata_long <- melt(
	mydata,
	id=c("person_id","event_name")
)

View(
	cast(
		mydata_long,
		formula = person_id ~ event_name,
		mean,
		na.rm = TRUE # This is for the mean() function.
	)
)

write.csv( # We can write out our wide dataset from above using write.csv()
	mydata_wide,
	"~/Downloads/mydata_wide.csv",
	row.names = FALSE,
	na = ""
)
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s