Post CSV Writer

Social Media
Purpose:

This program takes raw Facebook and Instagram data (from "Download my Information" on both platforms) and writes each post's date, description, and URI to a CSV spreadsheet for easier manipulation.
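
For reference, each output row has the form uri,description,timestamp (plus a trailing comma), so a row might look roughly like this (hypothetical values; note the "`" standing in for a comma in the description):

/facebook/posts/media/photo_1.jpg,Beach day with friends` sun was out!,2021:01:01 00:00:00,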

Dependencies

Python 3 (with the built-in json and datetime modules), ftfy

# REQUIRES fix_text FROM ftfy (pip install ftfy)
# NOTE: since the output is a CSV, every "," in the description field is replaced with a "`".
# Whenever the data is used again, change the "`" characters in the description field back to
# commas (a sketch of this is included after the script).
import json
from datetime import datetime
from ftfy import fix_text

post_directory = "#DIRECTORY OF POSTS JSON FILE - ex. posts_1.json#"
base = "#DIRECTORY OF THE BASE FACEBOOK FOLDER ON MACHINE#"
photo_descriptions_csv = "posts.csv"

# Opens the posts JSON file and loads its contents into data.
j = open(post_directory)
data = json.load(j)

# Opens (or creates) the output CSV to write the posts to. To append to an existing table instead of overwriting it, change "w" to "a".
f = open(photo_descriptions_csv, "w")

# Initializes some variables.
post_count = 0
error_count = 0

# Corrects encoded emojis and @-mentions in a post's description.
def fix_all(description):
    description = fix_text(description)
    if "@[" in description:
        description = fix_at(description)
    return description

# Fixes the encoded @-mentions that sometimes show up in post captions.
def fix_at(description):
    while "@[" in description:
        at_start = description.find("@[") + len("@[")
        temp_at = description[at_start:]
        at_end = temp_at.find("]")
        at = temp_at[:at_end]
        # Removes the excess ":"-separated ID prefixes from the @, keeping only the name.
        while ":" in at:
            at = at[(at.find(":") + 1):]
        # Once the @ is cleaned up, replaces the encoded instance (everything between the
        # brackets, brackets included) with the plain name, leaving "@name" in the text.
        encoded_at_start = description.find("@[") + 1
        temp_encoded_at = description[encoded_at_start:]
        encoded_at_end = temp_encoded_at.find("]") + 1
        encoded_at = temp_encoded_at[:encoded_at_end]
        description = description.replace(encoded_at, at)
    # Once all of the @-mentions have been corrected, returns the corrected description.
    return description
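
# For example (hypothetical encoded mention; the exact ID format comes from Facebook's export):
#   fix_at("Great trip with @[12345:6789:Jane Doe]!")
# strips the numeric IDs and brackets and returns:
#   "Great trip with @Jane Doe!"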


# Returns the description in a particular post's JSON data.
# This is the first location that a description could be in a post's JSON data.
def a(memory):
    try:
        for attachment in memory['attachments']:
            for info in attachment['data']:
                description = info['media']['description']
                return description
    except Exception:
        description = ""
        return description

# Returns the description in a particular post's JSON data.
# This is the second location that a description could be in a post's JSON data.
def b(memory):
    try:
        for info in memory['data']:
            description = info['post']
            return description
    except Exception:
        description = ""
        return description

# Takes the whole JSON data (memory) for a post and returns the description.
def find_des(memory):
    try:
        # The description can be in two (or neither) places in the JSON data, so it tests both cases.
        # If it finds a description in one of the cases, it fixes the @-mentions and emojis, and then returns it.
        case1 = a(memory)
        case2 = b(memory)
        if case1 != "":
            description = fix_all(case1)
            return description
        elif case2 != "":
            description = fix_all(case2)
            return description
        else:
            return ""
    # If the JSON data doesn't conform to the first two cases, this exception returns an empty description.
    except Exception:
        description = ""
        return description
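
# The two shapes these helpers expect (inferred from the lookups above; field names as they
# appear in Facebook's export, values hypothetical):
#   Case 1: {"attachments": [{"data": [{"media": {"description": "...", "uri": "..."}}]}]}
#   Case 2: {"data": [{"post": "..."}]}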

# Loops through all of the JSON data and writes the posts' information to a CSV table.
for i in data:
    # Finds the post's date and description.
    description = find_des(i)
    timestamp = i['timestamp']
    timestamp = datetime.utcfromtimestamp(timestamp).strftime('%Y:%m:%d %H:%M:%S')
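    # For example, a raw timestamp of 1609459200 becomes "2021:01:01 00:00:00" (UTC).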
    # Since each post may have multiple attachments/items, this makes sure each attachment is written to the CSV with the proper post data.
    try:
        for attachment in i['attachments']:
            for item in attachment['data']:
                post_count += 1
                proceed = True
                # The uri is the file's location in the "base" directory.
                uri = item['media']['uri']
                # Since this program will not save the data properly for videos, it skips them.
                if "/videos/" in uri:
                    proceed = False
                if proceed:
                    full_path = base + uri
                    no_commas = description.replace(",", "`")
                    f.write(full_path + "," + no_commas.replace("\n", "\\n") + "," + timestamp + "," + "\n")
    # If there is an issue looping through the attachments, this exception adds 1 to error_count and writes the row to the CSV without the missing info.
    except Exception:
        uri = ""
        error_count += 1
        no_commas = description.replace(",", "`")
        # Writes the data to the CSV.
        f.write(uri + "," + no_commas.replace("\n", "\\n") + "," + timestamp + "," + "\n")

# Closes the files and prints the program's post and error counts.
f.close()
j.close()
print("Found " + str(post_count) + " posts!")
print(str(error_count) + " ERRORS")
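
As noted at the top of the script, commas in descriptions are stored as "`" characters and newlines as literal "\n". A minimal sketch for reading the table back and undoing both substitutions (assuming the uri, description, timestamp column order written above):

import csv

with open("posts.csv", newline="") as table:
    for row in csv.reader(table):
        # Each row is uri, description, timestamp, plus an empty trailing field.
        uri, description, timestamp = row[0], row[1], row[2]
        # Undo the substitutions made when the row was written.
        description = description.replace("`", ",").replace("\\n", "\n")
        print(uri, description, timestamp)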