// matrix_thing/src/reddit.cpp

/**
	This file is a part of rexy's matrix bot
	Copyright (C) 2019 rexy712

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "reddit.hpp"
#include "raii/rjp_string.hpp"
#include "raii/string.hpp"
#include "raii/curler.hpp"
#include "raii/rjp_ptr.hpp"
#include "raii/static_string.hpp"

#include <algorithm> //search
#include <cstring>

//no idea if this covers everything reddit might dish out at me
//there is no consistency in their content tagging. there are gifs marked as images, others as videos
//they separate audio and video streams for their hosted videos, there is no true way to tell
//what kind of content a post contains since the post_hint field might be completely nonexistent.
//it's just a game of hacking together solutions each time reddit throws me a new type of unexpected complication.

namespace reddit{

	//Definitions of the sort-period constants.
	//The text each one holds is what get_top_post/get_controversial_post append
	//after "limit=1&t=" when building the request query.
	namespace time{
		period hour = "hour";
		period day = "day";
		period week = "week";
		period month = "month";
		period year = "year";
		period all = "all";
	}

	//Extract the reddit credentials from a parsed json object.
	//root is searched for the members "bot" and "account"; each of those is in turn
	//searched for "username" and "password".
	//A null root or a missing section leaves the matching auth_data members default
	//constructed instead of handing a null value to rjp_search_members.
	auth_data parse_auth_data(RJP_value* root){
		static const char* account_names[2] = {"bot", "account"};
		static const char* account_fields[2] = {"username", "password"};

		auth_data ret;
		if(!root)
			return ret;

		RJP_search_res accounts[2];
		RJP_search_res details[2];
		rjp_search_members(root, 2, account_names, accounts, 0);

		//credentials of the registered application ("bot")
		if(accounts[0].value){
			rjp_search_members(accounts[0].value, 2, account_fields, details, 0);
			ret.bot_name = details[0].value;
			ret.bot_pass = details[1].value;
		}

		//credentials of the reddit user account
		if(accounts[1].value){
			rjp_search_members(accounts[1].value, 2, account_fields, details, 0);
			ret.acc_name = details[0].value;
			ret.acc_pass = details[1].value;
		}

		return ret;
	}


	//Look up media.reddit_video.fallback_url.
	//Returns an empty rjp_string when media is null or either member is missing.
	static raii::rjp_string media_search(RJP_value* media){
		if(media){
			RJP_search_res video = rjp_search_member(media, "reddit_video", 0);
			if(video.value){
				RJP_search_res fallback = rjp_search_member(video.value, "fallback_url", 0);
				if(fallback.value)
					return raii::rjp_string(fallback.value);
			}
		}
		return raii::rjp_string{};
	}
	//Look up root.preview.reddit_video_preview.fallback_url.
	//Returns an empty rjp_string as soon as one of the members along the way is missing.
	static raii::rjp_string preview_search(RJP_value* root){
		static const char* const path[] = {"preview", "reddit_video_preview", "fallback_url"};

		RJP_value* current = root;
		for(const char* key : path){
			current = rjp_search_member(current, key, 0).value;
			if(!current)
				return raii::rjp_string{};
		}
		return raii::rjp_string(current);
	}
	//True only when root has an "is_reddit_media_domain" member whose boolean value is set.
	static bool check_reddit_media_domain(RJP_value* root){
		RJP_search_res flag = rjp_search_member(root, "is_reddit_media_domain", 0);
		if(!flag.value)
			return false;
		return rjp_value_boolean(flag.value);
	}

	//Find a video url for the post data in root.
	//The "media" member is tried first (media_search); if that gives nothing the
	//result of preview_search on root is returned, which may itself be empty.
	static raii::rjp_string find_video_url(RJP_value* root){
		raii::rjp_string found = media_search(rjp_search_member(root, "media", 0).value);
		if(found)
			return found;
		return preview_search(root);
	}
	//True when str ends in ".gifv" and has at least one character before that suffix.
	static bool is_gifv(const raii::string_base& str){
		static const char suffix[] = ".gifv";
		static constexpr size_t suffix_len = sizeof(suffix)-1;

		const size_t len = str.length();
		//a string that is only the suffix itself does not count
		if(len <= suffix_len)
			return false;
		return memcmp(str.get()+len-suffix_len, suffix, suffix_len) == 0;
	}
	//Check whether str looks like it ends in a file extension.
	//At most the last 6 characters are inspected, from the end backwards:
	//a '.' found before any '/' means there is an extension, a '/' (or a nul) means there is not.
	//The scan never goes in front of the first character, so empty and very short
	//strings are safe to pass.
	static bool has_extension(const raii::string_base& str){
		static constexpr size_t max_scan = 6;

		const char* s = str.get();
		const size_t len = str.length();
		if(!s)
			return false;

		const size_t limit = (len < max_scan) ? len : max_scan;
		for(size_t i = 1;i <= limit;++i){
			const char c = s[len-i];
			if(c == '/' || c == 0)
				return false;
			if(c == '.')
				return true;
		}
		return false;
	}
	//True when str contains "gfycat.com" anywhere.
	//The result of std::search is compared against the end of the range rather than
	//dereferenced, so a null or non-terminated buffer is never read.
	static bool is_gfycat_link(const raii::string_base& str){
		static const char gfycat[] = "gfycat.com";
		const char* begin = str.get();
		if(!begin)
			return false;
		const char* end = begin + str.length();
		return std::search(begin, end, gfycat, gfycat+sizeof(gfycat)-1) != end;
	}
	//True when str contains "imgur.com" anywhere.
	//The result of std::search is compared against the end of the range rather than
	//dereferenced, so a null or non-terminated buffer is never read.
	static bool is_imgur_link(const raii::string_base& str){
		static const char imgur[] = "imgur.com";
		const char* begin = str.get();
		if(!begin)
			return false;
		const char* end = begin + str.length();
		return std::search(begin, end, imgur, imgur+sizeof(imgur)-1) != end;
	}
	//An imgur link counts as direct when it also appears to end in a file extension.
	static bool is_direct_imgur_link(const raii::string_base& str){
		if(!is_imgur_link(str))
			return false;
		return has_extension(str);
	}


	//Construct from a copy of the raw listing text and parse it immediately.
	//m_type starts out as unrecognized, the same as in the rvalue overload, so type()
	//has a defined value even when _parse_post returns before classifying the post.
	post::post(const raii::string_base& p):
		m_post(p),
		m_type(post_type::unrecognized)
	{
		_parse_post();
	}
	//Construct by taking over the raw listing text in p and parse it immediately.
	//m_type starts out as unrecognized and is only changed if _parse_post classifies the post.
	post::post(raii::string_base&& p):
		m_post(std::move(p)),
		m_type(post_type::unrecognized)
	{
		_parse_post();
	}
	//Replace this post with one parsed from p.
	//If the text in p does not produce a valid post, this object is left as it was.
	post& post::operator=(const raii::string_base& p){
		post parsed(p);
		if(parsed)
			*this = std::move(parsed);
		return *this;
	}
	//A post is valid when its url, author, title and name are all set.
	//Anything other than a text post additionally needs a media url.
	post::operator bool(void)const{
		const bool has_common = (m_post_url && m_author && m_title && m_name);
		if(m_type == post_type::text)
			return has_common;
		return has_common && m_media_url;
	}

	//Read-only accessors. Each one hands back the stored member as is.

	//the text the post was constructed from
	const raii::string& post::raw(void)const{
		return m_post;
	}
	//url of the post's media as chosen by _parse_post
	const raii::rjp_string& post::mediaurl(void)const{
		return m_media_url;
	}
	//audio url built by _handle_reddit_hosted_video
	const raii::string& post::hosted_video_audio(void)const{
		return m_hosted_video_audio;
	}
	//"https://redd.it/" followed by the post id
	const raii::rjp_string& post::posturl(void)const{
		return m_post_url;
	}
	//value of the "author" member
	const raii::rjp_string& post::author(void)const{
		return m_author;
	}
	//value of the "post_hint" member
	const raii::rjp_string& post::post_hint(void)const{
		return m_post_hint;
	}
	//value of the "title" member
	const raii::rjp_string& post::title(void)const{
		return m_title;
	}
	//the listing child's "kind", an underscore, then the post id
	const raii::rjp_string& post::name(void)const{
		return m_name;
	}
	//true when _parse_post found a "crosspost_parent_list" member
	bool post::is_crosspost(void)const{
		return (m_flags & POST_FLAGS_CROSSPOSTED);
	}
	//classification assigned by _parse_post
	post_type post::type(void)const{
		return m_type;
	}
	void post::_parse_post(void){
		raii::rjp_ptr root(rjp_parse(m_post));
		if(!root)
			return;

		static const char* search_items[] = {"url", "author", "post_hint", "title", "id", "crosspost_parent_list"};
		static constexpr size_t num_searches = sizeof(search_items)/sizeof(search_items[0]);
		RJP_search_res results[num_searches];
		RJP_search_res data = rjp_search_member(root.get(), "data", 0);
		if(!data.value) return;
		data = rjp_search_member(data.value, "children", 0);
		if(!data.value) return;
		data.value = rjp_get_element(data.value);
		if(!data.value) return;
		RJP_search_res kind = rjp_search_member(data.value, "kind", 0);
		if(!kind.value) return;
		data = rjp_search_member(data.value, "data", 0);
		if(!data.value) return;

		RJP_search_res& crosspost = results[5];

		rjp_search_members(data.value, num_searches, search_items, results, 0);

		//reddit will *sometimes* make the url field point to the crosspost parent's comments page.
		//so we just always assume that the true link is in the crosspost parent
		if(crosspost.value){
			m_flags |= POST_FLAGS_CROSSPOSTED;
			crosspost.value = rjp_get_element(crosspost.value);
			crosspost = rjp_search_member(crosspost.value, "url", 0);
			if(crosspost.value)
				m_media_url = crosspost.value;
		}else{
			m_media_url = results[0].value;
		}
		m_author = results[1].value;
		m_post_hint = results[2].value;
		m_title = results[3].value;
		m_name = raii::rjp_string(kind.value) + "_" + rjp_value_string(results[4].value);
		m_post_url = "https://redd.it/" + raii::rjp_string(results[4].value);

		if(m_post_hint){
			//handle simple image
			if(!strcmp(m_post_hint, "image")){
				m_type = post_type::image;
			}
			//handle link
			else if(!strcmp(m_post_hint, "link")){
				m_type = post_type::link;

				//imgur support
				if(is_imgur_link(m_media_url)){
					if(is_gifv(m_media_url)){ //gifv is a video
						if(raii::rjp_string tmp = preview_search(data.value)){
							m_media_url = std::move(tmp);
							m_type = post_type::video;
						}
					}else{
						//imgur links don't lead to the image source. adding .jpg to the link leads to the source
						//except when the link is to an album or to a gifv
						m_media_url += ".jpg"; //imgur is dumb
						m_type = post_type::image;
					}
				//gfycat support
				}else if(is_gfycat_link(m_media_url)){
					if(raii::rjp_string tmp = find_video_url(data.value)){
						m_media_url = std::move(tmp);
						m_type = post_type::video;
					}
				}
			}
			//handle hosted video
			else if(!strcmp(m_post_hint, "hosted:video")){
				m_type = _handle_reddit_hosted_video(data.value, m_media_url, m_hosted_video_audio);
			}
			else if(!strcmp(m_post_hint, "rich:video")){
				RJP_search_res media = rjp_search_member(data.value, "media", 0);
				raii::rjp_string res = media_search(media.value);
				if(res){
					m_type = post_type::video;
					m_media_url = std::move(res);
					return;
				}
				res = preview_search(data.value);
				if(res){
					m_type = post_type::video;
					m_media_url = std::move(res);
					return;
				}
				m_type = post_type::link;
			}
			else{
				//assume text post for other
				m_type = post_type::text;
			}
		}else if(is_direct_imgur_link(m_media_url)){
			m_type = post_type::image;
			return;
		}else if(check_reddit_media_domain(data.value)){
			m_type = _handle_reddit_hosted_video(data.value, m_media_url, m_hosted_video_audio);
			/*RJP_value* media = rjp_search_member(data.value, "media", 0).value;
			if(media && (rjp_value_type(media) != json_null))
				m_type = post_type::video;
			else
				m_type = post_type::image;
			//*/
		}else{
			m_media_url.reset();
			m_type = post_type::text;
		}
	}
	//Classify a reddit hosted media post and work out its urls.
	//  data      - the post's "data" object
	//  media_url - replaced with the video's fallback url when one is found
	//  audio_url - set to the guessed audio stream url when the post is a video
	//Returns image for a reddit_video flagged "is_gif" (media_url is left as it was),
	//link when no video url can be found, video otherwise.
	post_type post::_handle_reddit_hosted_video(RJP_value* data, raii::rjp_string& media_url, raii::string& audio_url){

		RJP_search_res media = rjp_search_member(data, "media", 0);

		//treat gif as image even though reddit thinks they're videos.
		//"is_gif" is a member of the reddit_video object, so that is where it is looked up
		if(media.value){
			RJP_search_res video = rjp_search_member(media.value, "reddit_video", 0);
			if(video.value){
				RJP_search_res gif = rjp_search_member(video.value, "is_gif", 0);
				if(gif.value && rjp_value_boolean(gif.value))
					return post_type::image;
			}
		}

		raii::rjp_string res = media_search(media.value);
		if(!res){
			res = preview_search(data);
			if(!res){
				return post_type::link;
			}
		}
		media_url = std::move(res);

		//reddit hosts audio and video separately. Meaning I have to find a way to manually recombine them.
		//this sets up a link to the audio source of the video. the video might not actually have audio. when downloading
		//from the audio link, always make sure to check for 404 errors.
		static constexpr char url_base[] = "https://v.redd.it/";
		static constexpr size_t url_base_len = sizeof(url_base)-1;

		//the audio url is everything up to the first '/' after the host part, plus "/audio".
		//only skip over the host part when the url is actually long enough to have one
		const size_t url_len = media_url.length();
		const char* begin = media_url.get();
		const char* end = (url_len > url_base_len) ? strstr(begin+url_base_len, "/") : nullptr;
		if(!end)
			end = begin+url_len;
		const size_t len = end - begin;

		//NOTE(review): relies on raii::string(n) providing room for n characters plus the terminator - confirm
		audio_url = raii::string(len + 6);
		memcpy(audio_url.get(), begin, len);
		memcpy(audio_url.get()+len, "/audio", 6);
		audio_url[len+6] = 0;
		return post_type::video;
	}


	//Construct a bot: store a copy of useragent, then request an access token using a.
	//NOTE(review): _acquire_access_token runs inside the initializer list and uses m_curl and
	//m_useragent, so both presumably are declared before m_access_token in the class - confirm.
	bot::bot(const auth_data& a, const raii::string_base& useragent):
		m_curl(),
		m_useragent(useragent),
		m_access_token(_acquire_access_token(a)){}
	//Same as above but takes over the contents of useragent instead of copying them.
	bot::bot(const auth_data& a, raii::string_base&& useragent):
		m_curl(),
		m_useragent(std::move(useragent)),
		m_access_token(_acquire_access_token(a)){}
	//Copy the curl object, useragent and access token of b. No new token is requested.
	bot::bot(const bot& b):
		m_curl(b.m_curl),
		m_useragent(b.m_useragent),
		m_access_token(b.m_access_token){}
	//Take over the curl object, useragent and access token of b.
	bot::bot(bot&& b):
		m_curl(std::move(b.m_curl)),
		m_useragent(std::move(b.m_useragent)),
		m_access_token(std::move(b.m_access_token)){}

	//Move assignment: takes over b's useragent and access token.
	//m_curl is not touched, this bot keeps using its own curl object.
	bot& bot::operator=(bot&& b){
		m_useragent = std::move(b.m_useragent);
		m_access_token = std::move(b.m_access_token);
		return *this;
	}
	//Copy assignment: copy b into a temporary, then move assign from the temporary.
	bot& bot::operator=(const bot& b){
		bot tmp(b);
		return *this = std::move(tmp);
	}

	//the token most recently stored by a constructor or refresh_token
	const raii::rjp_string& bot::access_token(void)const{
		return m_access_token;
	}
	//the user agent string handed to curl for every request
	const raii::string& bot::useragent(void)const{
		return m_useragent;
	}
	//replace the user agent with a copy of s
	void bot::set_useragent(const raii::string_base& s){
		m_useragent = s;
	}
	//replace the user agent by taking over the contents of s
	void bot::set_useragent(raii::string_base&& s){
		m_useragent = std::move(s);
	}

	//Request a new access token using a and store whatever comes back.
	//_acquire_access_token returns an empty string on failure, which then replaces the old token.
	void bot::refresh_token(const auth_data& a){
		m_access_token = _acquire_access_token(a);
	}

	//Public fetchers. Each one forwards to _get_post with the listing name ("new", "hot", ...)
	//and a query string that always starts with "limit=1".
	//Overloads taking `after` append "&after=" followed by that value to the query.
	//Neither subreddit nor after is escaped here; they are placed into the url as given.

	post bot::get_new_post(const raii::string_base& subreddit){
		return _get_post(subreddit, "new"_ss, "limit=1"_ss);
	}
	post bot::get_new_post(const raii::string_base& subreddit, const raii::string_base& after){
		return _get_post(subreddit, "new"_ss, raii::string("limit=1&after=" + after));
	}
	post bot::get_hot_post(const raii::string_base& subreddit){
		return _get_post(subreddit, "hot"_ss, "limit=1"_ss);
	}
	post bot::get_hot_post(const raii::string_base& subreddit, const raii::string_base& after){
		return _get_post(subreddit, "hot"_ss, raii::string("limit=1&after=" + after));
	}
	post bot::get_rising_post(const raii::string_base& subreddit){
		return _get_post(subreddit, "rising"_ss, "limit=1"_ss);
	}
	post bot::get_rising_post(const raii::string_base& subreddit, const raii::string_base& after){
		return _get_post(subreddit, "rising"_ss, raii::string("limit=1&after=" + after));
	}
	post bot::get_best_post(const raii::string_base& subreddit){
		return _get_post(subreddit, "best"_ss, "limit=1"_ss);
	}
	post bot::get_best_post(const raii::string_base& subreddit, const raii::string_base& after){
		return _get_post(subreddit, "best"_ss, raii::string("limit=1&after=" + after));
	}
	//"top" and "controversial" additionally put the period's text into the query as "&t=<period>"
	post bot::get_top_post(const raii::string_base& subreddit, time::period period){
		raii::static_string pstr = period.get();
		return _get_post(subreddit, "top"_ss, raii::string("limit=1&t=" + pstr));
	}
	post bot::get_top_post(const raii::string_base& subreddit, const raii::string_base& after, time::period period){
		raii::static_string pstr = period.get();
		return _get_post(subreddit, "top"_ss, raii::string("limit=1&t=" + pstr + "&after=" + after));
	}
	post bot::get_controversial_post(const raii::string_base& subreddit, time::period period){
		raii::static_string pstr = period.get();
		return _get_post(subreddit, "controversial"_ss, raii::string("limit=1&t=" + pstr));
	}
	post bot::get_controversial_post(const raii::string_base& subreddit, const raii::string_base& after, time::period period){
		raii::static_string pstr = period.get();
		return _get_post(subreddit, "controversial"_ss, raii::string("limit=1&t=" + pstr + "&after=" + after));
	}


	//Fetch one listing and wrap the reply in a post.
	//The url is "https://oauth.reddit.com/r/<subreddit>/<category>", with "?<extra>" appended
	//when extra tests true. The request carries the bearer token header built from m_access_token.
	//The result of perform() is not inspected; whatever was collected in rep (possibly nothing)
	//is handed to post, so a failed request presumably surfaces as a post that tests false.
	post bot::_get_post(const raii::string_base& subreddit, const raii::string_base& category, const raii::string_base& extra){
		raii::string rep;
		static constexpr char url_base[] = "https://oauth.reddit.com/r/";
		raii::string url;
		if(extra)
			url = (url_base + subreddit) + "/" + category + "?" + extra;
		else
			url = (url_base + subreddit) + "/" + category;
		raii::curl_llist header(_create_auth_header(m_access_token));
		//clear whatever the previous request configured before setting this one up
		m_curl.reset();
		_setup_subreddit_get_curl(header, url, rep);
		m_curl.perform();
		return post(rep);
	}
	size_t bot::_get_response_curl_callback(char* ptr, size_t size, size_t nmemb, void* userdata){
		raii::rjp_string* reply = reinterpret_cast<raii::rjp_string*>(userdata);
		(*reply) += ptr;
		return size*nmemb;
	}
	//Build a header list holding the single entry "Authorization: bearer <access_token>".
	raii::curl_llist bot::_create_auth_header(const raii::string_base& access_token){
		return raii::curl_llist(raii::string("Authorization: bearer " + access_token));
	}
	//Configure m_curl for a listing request.
	//  header - header list installed on the request
	//  url    - address to fetch
	//  reply  - its address is given to curl as the write callback's userdata
	void bot::_setup_subreddit_get_curl(const raii::curl_llist& header, const raii::string_base& url, const raii::string_base& reply){
		//what to fetch and who is asking
		m_curl.seturl(url);
		m_curl.setheader(header);
		m_curl.setuseragent(m_useragent);

		//transfer behaviour
		m_curl.setopt(CURLOPT_BUFFERSIZE, 102400L);
		m_curl.setopt(CURLOPT_NOPROGRESS, 1L);
		m_curl.setopt(CURLOPT_MAXREDIRS, 50L);
		m_curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
		m_curl.setopt(CURLOPT_TCP_KEEPALIVE, 1L);
		m_curl.setopt(CURLOPT_FAILONERROR, 1L);
		m_curl.forcessl(CURL_SSLVERSION_TLSv1_2);

		//where the response body goes
		m_curl.setopt(CURLOPT_WRITEFUNCTION, _get_response_curl_callback);
		m_curl.setopt(CURLOPT_WRITEDATA, &reply);
	}

	size_t bot::_post_reply_curl_callback(char* ptr, size_t size, size_t nmemb, void* userdata){
		raii::string* data = reinterpret_cast<raii::string*>(userdata);
		(*data) += ptr;
		return size*nmemb;
	}
	//Create reddit login data
	//Produces "grant_type=password&username=<account_name>&password=<account_pass>".
	//The arguments are inserted as given; _request_access_token url-encodes them before calling this.
	raii::string bot::_create_request_post_data(const raii::string_base& account_name, const raii::string_base& account_pass){
		return raii::string("grant_type=password&username=" + account_name + "&password=" + account_pass);
	}
	//Setup login data for reddit bot
	//Produces "<bot_name>:<bot_pass>", which _setup_token_request_curl hands to setuserpwd.
	raii::string bot::_create_request_userpwd(const raii::string_base& bot_name, const raii::string_base& bot_pass){
		return raii::string(bot_name + ":" + bot_pass);
	}
	//Configure m_curl for the access token request.
	//  userpwd  - "name:password" pair given to setuserpwd
	//  postdata - request body
	//  result   - passed to curl as the write callback's userdata
	void bot::_setup_token_request_curl(const raii::string_base& userpwd, const raii::string_base& postdata, void* result){
		static constexpr char reddit_token_address[] = "https://www.reddit.com/api/v1/access_token";

		//where to send the request and what to send
		m_curl.seturl(reddit_token_address);
		m_curl.setuserpwd(userpwd);
		m_curl.setpostdata(postdata);
		m_curl.setuseragent(m_useragent);
		m_curl.setopt(CURLOPT_CUSTOMREQUEST, "POST");

		//transfer behaviour
		m_curl.setopt(CURLOPT_BUFFERSIZE, 102400L);
		m_curl.setopt(CURLOPT_NOPROGRESS, 1L);
		m_curl.setopt(CURLOPT_MAXREDIRS, 50L);
		m_curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);
		m_curl.setopt(CURLOPT_TCP_KEEPALIVE, 1L);
		m_curl.setopt(CURLOPT_FAILONERROR, 1L);
		m_curl.forcessl(CURL_SSLVERSION_TLSv1_2);

		//where the response body goes
		m_curl.setopt(CURLOPT_WRITEFUNCTION, _post_reply_curl_callback);
		m_curl.setopt(CURLOPT_WRITEDATA, result);
	}

	//Perform the token request for auth and return the raw reply text.
	//Returns an empty string when the transfer does not finish with CURLE_OK.
	raii::string bot::_request_access_token(const auth_data& auth){
		//start from a clean handle. _setup_token_request_curl only sets the options it needs, so
		//anything left behind by an earlier listing request would otherwise still be in effect,
		//including the header list that _get_post owned and has since let go out of scope
		m_curl.reset();

		//URL encode the POST data
		raii::curl_string acc_name = m_curl.encode(auth.acc_name, auth.acc_name.length());
		raii::curl_string acc_pass = m_curl.encode(auth.acc_pass, auth.acc_pass.length());

		//unify the post data, clean up remnants
		raii::string postdata = _create_request_post_data(acc_name, acc_pass);
		acc_name.reset();
		acc_pass.reset();

		//Unify the username/password
		raii::string userpwd = _create_request_userpwd(auth.bot_name, auth.bot_pass);

		//Load curl with data then run POST operation
		raii::string reply;
		_setup_token_request_curl(userpwd, postdata, &reply);
		const CURLcode result = m_curl.perform();

		if(result != CURLE_OK)
			return {};
		return reply;
	}

	//Request a token for a and pull the "access_token" member out of the reply.
	//Returns an empty string when the request yields nothing or the reply cannot be parsed.
	raii::rjp_string bot::_acquire_access_token(const auth_data& a){
		raii::string reply = _request_access_token(a);
		if(reply){
			raii::rjp_ptr root(rjp_parse(reply));
			if(root){
				RJP_search_res token = rjp_search_member(root.get(), "access_token", 0);
				return raii::rjp_string{token.value};
			}
		}
		return raii::rjp_string{};
	}
}