Post CSV Writer

Social Media
Purpose:

This program takes raw Facebook and Instagram data (from "Download my Information" on both platforms) and writes each post's date, description, and URI to a CSV spreadsheet for easier manipulation.
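
For reference, each output row has the form uri,description,timestamp (plus a trailing comma), so a row might look roughly like this (hypothetical values; note the "`" standing in for a comma in the description):

/facebook/posts/media/photo_1.jpg,Beach day with friends` sun was out!,2021:01:01 00:00:00,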

Dependencies

Python 3 (with the built-in json and datetime modules), ftfy

# REQUIRES fix_text FROM ftfy (pip install ftfy)
# NOTE: since the output is a CSV, every "," in the description field is replaced with a "`".
# Whenever the data is used again, change the "`" characters in the description field back to
# commas (a sketch of this is included after the script).
import json
from datetime import datetime
from ftfy import fix_text

post_directory = "#DIRECTORY OF POSTS JSON FILE - ex. posts_1.json#"
base = "#DIRECTORY OF THE BASE FACEBOOK FOLDER ON MACHINE#"
photo_descriptions_csv = "posts.csv"

# Opens the posts JSON file and loads its contents into data.
j = open(post_directory)
data = json.load(j)

# Opens (or creates) the output CSV to write the posts to. To append to an existing table instead of overwriting it, change "w" to "a".
f = open(photo_descriptions_csv, "w")

# Initializes some variables.
post_count = 0
error_count = 0

# Corrects encoded emojis and @-mentions in a post's description.
def fix_all(description):
    description = fix_text(description)
    if "@[" in description:
        description = fix_at(description)
    return description

# Fixes the encoded @-mentions that sometimes show up in post captions.
def fix_at(description):
    while "@[" in description:
        at_start = description.find("@[") + len("@[")
        temp_at = description[at_start:]
        at_end = temp_at.find("]")
        at = temp_at[:at_end]
        # Removes the excess ":"-separated ID prefixes from the @, keeping only the name.
        while ":" in at:
            at = at[(at.find(":") + 1):]
        # Once the @ is cleaned up, replaces the encoded instance (everything between the
        # brackets, brackets included) with the plain name, leaving "@name" in the text.
        encoded_at_start = description.find("@[") + 1
        temp_encoded_at = description[encoded_at_start:]
        encoded_at_end = temp_encoded_at.find("]") + 1
        encoded_at = temp_encoded_at[:encoded_at_end]
        description = description.replace(encoded_at, at)
    # Once all of the @-mentions have been corrected, returns the corrected description.
    return description
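
# For example (hypothetical encoded mention; the exact ID format comes from Facebook's export):
#   fix_at("Great trip with @[12345:6789:Jane Doe]!")
# strips the numeric IDs and brackets and returns:
#   "Great trip with @Jane Doe!"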


# Returns the description in a particular post's JSON data.
# This is the first location that a description could be in a post's JSON data.
def a(memory):
    try:
        for attachment in memory['attachments']:
            for info in attachment['data']:
                description = info['media']['description']
                return description
    except Exception:
        description = ""
        return description

# Returns the description in a particular post's JSON data.
# This is the second location that a description could be in a post's JSON data.
def b(memory):
    try:
        for info in memory['data']:
            description = info['post']
            return description
    except Exception:
        description = ""
        return description

# Takes the whole JSON data (memory) for a post and returns the description.
def find_des(memory):
    try:
        # The description can be in two (or neither) places in the JSON data, so it tests both cases.
        # If it finds a description in one of the cases, it fixes the @-mentions and emojis, and then returns it.
        case1 = a(memory)
        case2 = b(memory)
        if case1 != "":
            description = fix_all(case1)
            return description
        elif case2 != "":
            description = fix_all(case2)
            return description
        else:
            return ""
    # If the JSON data doesn't conform to the first two cases, this exception returns an empty description.
    except Exception:
        description = ""
        return description
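
# The two shapes these helpers expect (inferred from the lookups above; field names as they
# appear in Facebook's export, values hypothetical):
#   Case 1: {"attachments": [{"data": [{"media": {"description": "...", "uri": "..."}}]}]}
#   Case 2: {"data": [{"post": "..."}]}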

# Loops through all of the JSON data and writes the posts' information to a CSV table.
for i in data:
    # Finds the post's date and description.
    description = find_des(i)
    timestamp = i['timestamp']
    timestamp = datetime.utcfromtimestamp(timestamp).strftime('%Y:%m:%d %H:%M:%S')
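    # For example, a raw timestamp of 1609459200 becomes "2021:01:01 00:00:00" (UTC).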
    # Since each post may have multiple attachments/items, this makes sure each attachment is written to the CSV with the proper post data.
    try:
        for attachment in i['attachments']:
            for item in attachment['data']:
                post_count += 1
                proceed = True
                # The uri is the file's location in the "base" directory.
                uri = item['media']['uri']
                # Since this program will not save the data properly for videos, it skips them.
                if "/videos/" in uri:
                    proceed = False
                if proceed:
                    full_path = base + uri
                    no_commas = description.replace(",", "`")
                    f.write(full_path + "," + no_commas.replace("\n", "\\n") + "," + timestamp + "," + "\n")
    # If there is an issue looping through the attachments, this exception adds 1 to error_count and writes the row to the CSV without the missing info.
    except Exception:
        uri = ""
        error_count += 1
        no_commas = description.replace(",", "`")
        # Writes the data to the CSV.
        f.write(uri + "," + no_commas.replace("\n", "\\n") + "," + timestamp + "," + "\n")

# Closes the files and prints the program's post and error counts.
f.close()
j.close()
print("Found " + str(post_count) + " posts!")
print(str(error_count) + " ERRORS")
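
As noted at the top of the script, commas in descriptions are stored as "`" characters and newlines as literal "\n". A minimal sketch for reading the table back and undoing both substitutions (assuming the uri, description, timestamp column order written above):

import csv

with open("posts.csv", newline="") as table:
    for row in csv.reader(table):
        # Each row is uri, description, timestamp, plus an empty trailing field.
        uri, description, timestamp = row[0], row[1], row[2]
        # Undo the substitutions made when the row was written.
        description = description.replace("`", ",").replace("\\n", "\n")
        print(uri, description, timestamp)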