KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory :  /domains/markrose/

Upload File :
current_dir [ Writeable ] document_root [ Writeable ]

 

Current File : /domains/markrose/markov.c
/*
**	MARKOV.C
**      (c) 2018 by Mark Rosenfelder
*/

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define TRUE 1
#define FALSE 0

typedef struct wd Word;

typedef struct sec /* Record of second words and their counts */
{
	Word *w;   // the word
	int   count;   // how many times we found it
	struct sec *next;
} Second;

typedef struct wd  /* Record of one word found */
{
	char *token;   // Pointer to the token itself
	Second *seconds;  // Words that follow it
} Word;


#define MAXTOKEN 40000

int nToken;				/* Number of words defined */
Word *tokens[MAXTOKEN];	/* List of words and counts */

Word *lastToken = NULL;


#define isPunc(ch)  (strchr("—,.;?!¿¡“”\"",ch))

Word *searchToken( char *token, int create )
/*	Find the token in the token list, using a binary search.
*
*	If the token is found, return a pointer to it.
*
*	If not, add it here.  We will know where to put it from where
*	the search stopped, and since the list of tokens is linear it's
*	a simple matter to update it: we just move everything down one.
*
*	Don’t add a new token if create is false.
*
*	If there are too many tokens to add another, we just return NULL.
*/
{
	int beg, mid, end, diff;
	
	if (nToken == 0)
	{
		/* Blank list?  Easy to see where to add the new entry. */
	
		mid = 0;
	}
	else
	{
		/* Search for the entry, using a binary search. */
			
		beg = 0;
		end = nToken - 1;
		
		while (beg <= end)
		{
			mid = (beg + end) / 2;
			
			diff = strcmp( token, tokens[mid]->token );
			
			if (!diff)						/* token == mid */
				return( tokens[mid] );
			else 
				if (diff > 0) 				/* token >  mid */
					beg = mid + 1;
				else
					end = mid - 1;			/* token <  mid */
		}
		
		/*	Once the loop fails, the new entry should be added
		 *	either at mid or at mid + 1, depending on the results
		 *	of the last search.
		 */
		
		if (diff > 0)
			mid++;
	}
	
	if (create) 
	{
		if (nToken < MAXTOKEN)
		{
			/* Add the new entry. */
			
			memmove( &tokens[mid+1], &tokens[mid], 
					 (nToken - mid) * sizeof(Word) );

			Word *w = malloc(sizeof(Word));
			if (!w) { printf("Out of memory\n"); exit(1); };

			w->seconds = NULL;		 
			w->token = malloc( strlen(token) + 1 );
			if (!w->token)  { printf("Out of memory\n"); exit(1); };
			strcpy( w->token, token );

			tokens[mid] = w;
			
			nToken++;
			
			return( w );
		}
		else
		{
			printf( "\nTOO MANY WORDS!\n" );
			exit(1);
		}
	}
	
	/* Too many entries, or no creation wanted.  Return. */
	
	return(NULL);
	
} /*searchToken*/


/* Find the word w in the list of seconds.  
 * If found, increment its count.
 * If not, add to the list.
 * This is an unsorted list, but we hope it doesn't grow too large.
 */
void incSecondList(Word *first, Word *second) 
{
	Second *sec;
	Second *last = NULL;
	
	/* Linear search for second word */
	for (sec = first->seconds; sec; sec = sec->next)
	{
		if (!strcmp(sec->w->token, second->token)) 
			break;
		last = sec;
	}

	/* If we found it, increment it and go */
	if (sec) {
		sec->count++;
		return;
	}

	/* Didn't find it.  Allocate a new one */
	sec = malloc(sizeof(Second));
	if (!sec) { printf("Out of memory\n"); exit(1); }

	sec->w = second;
	sec->count = 1;
	sec->next = NULL;

	// Link to the new object
	if (last)
		last->next = sec;
	else
		first->seconds = sec;
}


/* We have a new token.
 * Add it to the list of words in lastToken.
 */
void addToken(char *token) 
{
        Word *w = searchToken(token, TRUE);
	if (lastToken) 
	{
		incSecondList(lastToken, w);
	}

	lastToken = w;
}

int NextToken( char **sp )
/*	Gets the next token in s (if any) and processes it.
*	The search begins at *sp.  *sp is incremented so this routine
*	can be called in a loop.
*
*  RETURN VALUE
*	Whether there is anything else to process in this line.
*/
{
	char	ch;
	char   *s, *token;
	
    /* Skip blanks */    	
    
    for (s = *sp; *s && isspace(*s); s++)
    	;

    /* Token = part of s before next blank.
     * Either get a string of punctuation, or a string of non-punctuation.
     */
    if (isPunc(*s)) {
	    for (token = s; *s && isPunc(*s) && !isspace(*s); s++)
	    	;
    } else {
	    for (token = s; *s && !isPunc(*s) && !isspace(*s); s++)
    		*s = tolower(*s);
    }
    
    *sp = s;
    	
    if (s - token)	
    {
    	/* Temporarily add a null character at end of token */
    	
    	ch = *s;
    	*s = '\0';
        
        /* Process token */
        addToken(token);
	        
        *s = ch;
        
        return(TRUE);
    }
    else
    	return(FALSE);
    
} /*NextToken*/


// Read the corpus
void ReadFile( char *fn )
{
	FILE *f;
	char raw[512];
	char cooked[1024];
	char remainder[512];
	char *t;

	*remainder = '\0';	
	f = fopen( fn, "r" );
	if (!f)
	{
		printf( "Can’t open file %s\n", fn );
		return;
	}
	
	printf( "Reading file %s.\n", fn );
    
    while (fgets( raw, 100, f ))
    {
	/* Unix/Mac discrepancy with CRLF.  Remove both. */
	for (t = raw; *t; t++)
		if (*t == '\r' || *t == '\n')
			*t = ' ';

	/* Add remainder from previous line, if any */
	strcpy(cooked, remainder);
	strcat(cooked, raw);   

	/* Find the new remainder */
	t = cooked + strlen(cooked) - 1;
	while (t > cooked && !isspace(*t))
		t--;
	if (t == cooked)
		*remainder = '\0';
	else {
		strcpy(remainder, t + 1);
		*t = '\0';
	}

	/* Process tokens */   	
    	for (t = cooked; NextToken(&t); )
    		;
    }
          
    /* there might be something in remainder */
    for (t = remainder; NextToken(&t); )
	;

    fclose(f);
        
} /*ReadFile*/


/* Generate one word from this token */
Word *markovNode(Word *w) 
{
	if (!w) return(NULL);

	/* Get the total word count */
	int n = 0;
	Second *sec;
	for (sec = w->seconds; sec; sec = sec->next) 
		n += sec->count;

	/* Pick a random number between 0 and n - 1 */
	int tgt = rand() % n;

	/* Go through counts again, stopping when we hit tgt */
	n = 0;
	for (sec = w->seconds; sec; sec = sec->next) 
	{
		n += sec->count;
		if (n >= tgt) 
			return sec->w;
	}

	return NULL;
}

/* Generate a text */
void markovStart() 
{
	/* This is a kludge.
	 * Find the first word that has over 1000 second-words.
	 */
	int i;
	int c;
	for (i = 0; i < nToken; i++) {
		Second *sec = NULL;
		c = 0;
		for (sec = tokens[i]->seconds; sec; sec = sec->next) 
			c++;
		if (c > 100) break;
	}
	if (i >= nToken) { printf( "No start points found\n"); return; }

	printf("Initial token should be %s with a count of %i\n", tokens[i]->token, c );


	/* Generate a text */
	Word *w = tokens[i];
	for (i = 0; w && i < 250; i++) {
		char ch = w->token[0];
		if (!isPunc(ch)) 
			printf(" ");
		printf( "%s", w->token );
		if (ch == '.' || ch == '?' || ch == '!')
			printf("\n");
		w = markovNode(w);
	}

	printf("\n");
}

// Process
int main(int argc, char *argv[])
{
	int i;

	/* Initializes random number generator */
  	srand((unsigned) time(NULL));
	
	/* Read the input file(s) */
	if (argc == 1)
		ReadFile("4901");
	else 
		for (i = 1; i < argc; i++)
			ReadFile(argv[i]);

	printf("Total tokens in corpus: %i\n", nToken );

	/* Debug 
	for (i = 0; i < nToken; i++) {
		printf("%s\n", tokens[i]->token);
		Second *sec = NULL;
		for (sec = tokens[i]->seconds; sec; sec = sec->next) {
			printf(" %s %i\n", sec->w->token, sec->count );
		}
	}
	 */

	/* Generate text */
	markovStart();
}

Anon7 - 2021