/***************************************************************************/
/* 		This code is part of WWW graber called pavuk		   */
/*		Copyright (c) 1997,1998,1999 Ondrejicka Stefan		   */
/*		(ondrej@idata.sk)					   */
/*		Distributed under GPL 2 or later			   */
/***************************************************************************/

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>

#ifdef HAVE_SYS_MODE_H
#include <sys/mode.h>
#endif
 
#include "config.h"
#include "condition.h"
#include "mime.h"
#include "robots.h"
#include "url.h"
#include "tools.h"
#include "doc.h"
#include "abstract.h"
#include "tools.h"
#include "mode.h"
#include "times.h"
#include "errcode.h"
#include "recurse.h"

/* Downloads "/robots.txt" from the server that hosts the given URL and
   returns its contents, or NULL when nothing usable was retrieved. */
static char *get_robots();
/* Extracts from a robots.txt text the Disallow patterns collected for the
   given agent name; the result is a NULL-terminated array or NULL. */
static char ** parse_robots();

/* NULL-terminated table of per-server robots.txt entries (site, port and
   pattern list). It starts out empty (NULL) and check_robots() appends one
   entry the first time a given site/port pair is seen. */
static robotlim **robots = NULL;

/***************************************************/
/* check whether a URL satisfies the conditions    */
/* imposed on WWW robots (robots.txt)              */
/***************************************************/

/*
 * Decide whether URLP may be fetched according to the robots.txt rules of
 * the server it points to.
 *
 * Returns TRUE when the URL is allowed and FALSE when its document path
 * starts with one of the Disallow patterns stored for that server.
 * URLs that are neither HTTP nor HTTPS, and every URL when
 * cfg.condition.allow_robots is off, are always allowed.
 *
 * The rules are looked up in the file-level "robots" table by site and
 * port. On the first request for a site/port pair a new entry is appended
 * to the table, robots.txt is downloaded with get_robots() and parsed with
 * parse_robots() for the agent name "pavuk". An entry whose pattern list is
 * NULL (nothing downloaded, or no pattern found) allows everything.
 */
bool check_robots(urlp)
url *urlp;
{
	robotlim *entry = NULL;
	char **p,*pom;
	int i = 0;

	if ((urlp->type != URLT_HTTP && urlp->type != URLT_HTTPS) || !cfg.condition.allow_robots) return TRUE;

	/* linear search for an entry with the same site and port;
	   afterwards i is either its index or the number of entries */
	if (robots)
	{
		while (robots[i] && !(!strcmp(robots[i]->site, urlp->p.http.host) &&
			(robots[i]->port == urlp->p.http.port))) i++;

		entry = robots[i];
	}

	if (!entry)
	{
		/* one more slot for the new entry plus the NULL terminator */
		robots = (robotlim **) _realloc(robots ,
			(i + 2) * sizeof(robotlim *));

		entry = (robotlim *) _malloc(sizeof(robotlim));
		/* keep a private copy of the host name so that the table does
		   not point into a URL structure it does not own */
		entry->site = new_string(urlp->p.http.host);
		entry->port = urlp->p.http.port;
		entry->pat = NULL;

		/* the entry is registered in the table before the download
		   starts */
		robots[i] = entry;
		robots[i+1] = NULL;

		pom = get_robots(urlp);
		if (pom)
		{
			/* entry is addressed directly rather than through the
			   table, so the store stays valid even if the table was
			   reallocated while robots.txt was being fetched */
			entry->pat = parse_robots("pavuk" , pom);
			free(pom);
		}
	}

	p = entry->pat;

	if (!p) return TRUE;

	/* a pattern forbids every document whose path begins with it */
	for ( ; *p ; p++)
	{
		if (!strncmp(*p, urlp->p.http.document, strlen(*p))) return FALSE;
	}

	return TRUE;
}

/************************************************/
/* transfer of the "robots.txt" file for the	*/
/* given URL					*/
/************************************************/

/*
 * Download "/robots.txt" from the server that hosts URLP.
 *
 * A temporary URL is built with the type, user, password, host and port of
 * URLP and the document "/robots.txt". It is downloaded with
 * doc_download(), which is repeated while the failure is an HTTP redirect
 * or a truncated transfer, limited by cfg.nredir and cfg.nreget.
 *
 * Returns the downloaded contents when doc_download() reports success
 * (the document is also passed to doc_store()); the caller releases the
 * buffer with free(). Returns NULL otherwise. When the server answered
 * "not found" or "gone", an empty file is created under the local name of
 * robots.txt.
 */
static char * get_robots(urlp)
url *urlp;
{
	url *purl = _malloc(sizeof(url));
	doc docu;
	int rstat;
	char *ret=NULL;
	char *pom;
	int nredir = 0,nreget = 0;
	struct stat estat;
	char *pp;
	int f;
	global_connection_info con_info;

#ifdef I_FACE
	if (cfg.xi_face)
	{
		iface_set_what(gettext("transfering \"robots.txt\""));
	}
#endif
	xprintf(1 , gettext("transfering \"robots.txt\"\n"));

	/* build the URL of robots.txt on the same server as urlp */
	purl->type = urlp->type;
	purl->parent_url = _malloc(sizeof(url *));
	purl->parent_url[0] = NULL;
	purl->status = URL_INLINE_OBJ; /*** required if -store_name option  used ***/
	purl->extension = NULL;
	purl->local_name = NULL;

#ifdef WITH_TREE
#ifdef I_FACE
	purl->prop = NULL;
	purl->tree_nfo = NULL;
#endif
#endif

	purl->level = 0;
	purl->p.http.user = new_string(urlp->p.http.user);
	purl->p.http.password = new_string(urlp->p.http.password);
	purl->p.http.host = new_string(urlp->p.http.host);
	purl->p.http.port = urlp->p.http.port;
	purl->p.http.document = new_string("/robots.txt");
	purl->p.http.anchor_name = NULL;
	purl->p.http.searchstr = NULL;

	doc_init(&docu, purl);
	docu.save_online = FALSE;
	docu.report_size = FALSE;

	/* in sync mode hand the modification time of an existing local
	   copy (if it is not a directory) over to the download */
	if (cfg.mode == MODE_SYNC)
	{
		pp = url_to_filename(purl , TRUE);
		if (!stat(pp , &estat) && !S_ISDIR(estat.st_mode))
		{
			docu.dtime = estat.st_mtime;
		}
	}

	init_global_connection_data(&con_info);

	/* doc_download() returns non-zero on failure; try again only after
	   a redirect or a truncated transfer, while the limits allow it */
	while ((rstat = doc_download(&docu, TRUE , FALSE)) &&
		( (nredir < cfg.nredir && docu.errcode == ERR_HTTP_REDIR) ||
		  (nreget < cfg.nreget && docu.errcode == ERR_HTTP_TRUNC)))
	{
		if (docu.errcode) report_error(&docu, "robots.txt");

		save_global_connection_data(&con_info, &docu);

		nredir += docu.errcode == ERR_HTTP_REDIR;
		nreget += docu.errcode == ERR_HTTP_TRUNC;

		if (docu.errcode == ERR_HTTP_REDIR)
		{
			/* NOTE(review): the message prints the original URL
			   (purl), not the redirect target - confirm whether
			   that is intended */
			pom = url_to_urlstr(purl , FALSE);
			xprintf(1 , gettext("Hmm: redirecting \"robots.txt\" to %s ???\n") , pom);
			_free(pom);

			/* follow the redirect on the next attempt */
			docu.doc_url = docu.doc_url->moved_to;
		}

		/* drop the results of the failed attempt */
		_free(docu.contents);
		_free(docu.mime);
		_free(docu.type_str);

		doc_remove_lock(&docu);

		if (cfg.mode == MODE_SYNC)
		{
			pp = url_to_filename(purl , TRUE);
			if (!stat(pp , &estat) && !S_ISDIR(estat.st_mode))
			{
				docu.dtime = estat.st_mtime;
			}
		}
		restore_global_connection_data(&con_info, &docu);
	}

	if (docu.errcode) report_error(&docu, "robots.txt");

	save_global_connection_data(&con_info, &docu);
	kill_global_connection_data(&con_info);

	if (!rstat)
	{
		/* success: store the document and pass the contents on to
		   the caller, who becomes responsible for freeing them */
		doc_store(&docu,TRUE);
		ret =  docu.contents;
	}
	else
	{
		if (docu.errcode == ERR_HTTP_NFOUND ||
		    docu.errcode == ERR_HTTP_GONE)
		{
			/* the server has no robots.txt: leave an empty file
			   under its local name */
			pp = url_to_filename(purl , TRUE);

			/* open() reports failure with -1; descriptor 0 is a
			   valid result and has to be closed as well */
			if ((f = open(pp , O_BINARY | O_CREAT | O_TRUNC | O_WRONLY , S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR)) >= 0)
				close(f);
		}

		/* nothing is returned to the caller, so whatever the failed
		   download left behind is released here, exactly as the
		   retry loop above does */
		_free(docu.contents);
	}

	doc_remove_lock(&docu);

	_free(docu.type_str);
	_free(docu.mime);
	free_deep_url(purl);
	free(purl);
	return ret;
}

/*******************************/
/* parsing of "robots.txt"     */
/*******************************/

/*
 * Helper of parse_robots(): if LINE begins with the directive NAME
 * (compared case-insensitively; NAME includes the trailing colon), return
 * a pointer to its value inside LINE with the surrounding spaces and tabs
 * removed. Trailing blanks are removed by writing NUL bytes into LINE.
 * Returns NULL when LINE does not begin with NAME.
 */
static char *robots_field_value(line, name)
char *line;
char *name;
{
	size_t nlen = strlen(name);
	char *val, *end;

	if (strncasecmp(name, line, nlen)) return NULL;

	val = line + nlen;
	while (*val == ' ' || *val == '\t') val++;

	/* trim from the last real character backwards, never past val */
	end = val + strlen(val);
	while (end > val && (end[-1] == ' ' || end[-1] == '\t')) *--end = '\0';

	return val;
}

/*
 * Collect the Disallow patterns that the robots.txt text FILE contains
 * for the robot named AGENT.
 *
 * FILE is a NUL-terminated buffer; it is modified in place (line ends and
 * trailing blanks are overwritten with NUL bytes). Lines are separated by
 * any run of CR/LF characters.
 *
 * A "User-Agent:" line whose value starts with '*' or with AGENT turns
 * collecting on; a line consisting only of spaces and tabs turns it off.
 * While collecting is on, every non-empty "Disallow:" value is copied with
 * new_string() into the result. Directive names are matched without regard
 * to case, and white space after the colon is optional.
 *
 * NOTE(review): because a run of CR/LF characters is skipped as a whole, a
 * completely empty line is never seen by the loop and so does not turn
 * collecting off; a non-matching "User-Agent:" line does not turn it off
 * either. Both make the result stricter than the file asks for - confirm
 * whether that is wanted.
 *
 * Returns a NULL-terminated array of pattern strings, or NULL when no
 * pattern was collected.
 */
static char **parse_robots(agent,file)
char *agent;
char *file;
{
	char **ret =  NULL;
	int n_ret = 0;
	bool is_me = FALSE;
	char *line = file;
	char *next, *val;

	while (*line)
	{
		/* Locate the start of the following line first, while "line"
		   still points at the beginning of the current one; the
		   pointers moved below must not influence how far we advance. */
		next = line + strcspn(line , "\r\n");
		if (*next) *next++ = '\0';
		next += strspn(next , "\n\r");

		while (*line == ' ' || *line == '\t') line++;

		if (!*line)
		{
			is_me = FALSE;
		}
		else if ((val = robots_field_value(line , "User-Agent:")) != NULL)
		{
			if (*val == '*') is_me = TRUE;
			else if (!strncmp(agent , val , strlen(agent))) is_me = TRUE;
		}
		else if (is_me && (val = robots_field_value(line , "Disallow:")) != NULL)
		{
			/* an empty Disallow value forbids nothing */
			if (*val)
			{
				ret = (char **)
					_realloc(ret, (n_ret + 2) * sizeof(char *));
				ret[n_ret+1] = NULL;
				ret[n_ret] = new_string(val);
				n_ret++;
			}
		}

		line = next;
	}
	return ret;
}

